Crawling Twitter data by geolocation and keyword and generating a word cloud

Crawl Twitter data by geolocation and keyword, store it in MongoDB, and generate a word cloud.

Please credit the source when reposting.

  • Fetching data with tweepy
  • Generating the word cloud

Fetching data with tweepy

1. Define the model (model.py)

from mongoengine import Document, ObjectIdField, StringField, IntField, DateTimeField

class twitter_post(Document):
    _id = ObjectIdField(primary_key = True)
    screen_name = StringField(max_length = 128)
    text = StringField(required = True, max_length = 2048)
    text_id = IntField(required = True)
    created_at = DateTimeField(required = True)
    in_reply_to_screen_name = StringField(max_length = 64)
    retweet_count = IntField()
    favorite_count = IntField()
    source = StringField(max_length = 1024)
    longitude = StringField(max_length = 32)
    latitude = StringField(max_length = 32)
    location = StringField(max_length = 256)
    country_code = StringField(max_length = 64)
    lang = StringField(max_length = 4)
    time_zone = StringField(max_length = 64)
    province = StringField(max_length = 64)
    city = StringField(max_length = 64)
    district = StringField(max_length = 64)
    street = StringField(max_length = 64)
    street_number = StringField(max_length = 64)

    meta = {
        'ordering': ['created_at','screen_name'],
        'collection': 'twitter_posts'
    }
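Not from the original post: once the crawler below has saved some tweets, the model can be queried directly through mongoengine's queryset API, for example:

from mongoengine import connect

# connect to the same database the crawler writes to
connect('ANXIETY', host='localhost', port=27017)

# print a few English-language tweets, using the default ordering defined in meta
for post in twitter_post.objects(lang='en').limit(3):
    print(post.screen_name, post.created_at, post.text[:60])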

2. Call the Baidu Maps API to turn latitude/longitude into province, city, and street information

import requests

def GetAddress(lon, lat):
    url = 'http://api.map.baidu.com/geocoder/v2/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
    payload = {'output': 'json', 'ak': 'pAjezQsQBe8v1c1Lel87r4vprwXiGCEn'}
    payload['location'] = '{0},{1}'.format(lon, lat)
    print(lon, lat)
    try:
        content = requests.get(url, params=payload, headers=header).json()
        content = content['result']['addressComponent']
        if content['street'] is None:  # street-level info is missing for some locations
            content['street'] = 'NULL'
        if content['street_number'] is None:
            content['street_number'] = 'NULL'
    except:
        # request or parsing failed: return NULL placeholders for every field we use
        content = {"province": "NULL", "city": "NULL", "district": "NULL",
                   "street": "NULL", "street_number": "NULL"}
    return content
print(GetAddress(40.07571952, 116.60609467))

Below is the location information returned for three latitude/longitude pairs (screenshot omitted).
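As a stand-in, a minimal sketch like the following prints the stored fields for a few arbitrary coordinate pairs (chosen purely for illustration and passed in the same argument order as the call above):

# Illustrative only: these coordinate pairs are not the ones from the original screenshot.
samples = [(40.07571952, 116.60609467), (39.9042, 116.4074), (31.2304, 121.4737)]
for a, b in samples:
    addr = GetAddress(a, b)
    print(addr.get('province'), addr.get('city'), addr.get('district'),
          addr.get('street'), addr.get('street_number'))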

3. Crawl data through the Twitter API via tweepy

import datetime

import tweepy
from tweepy.parsers import JSONParser
from bson.objectid import ObjectId
from mongoengine import connect

consumer_key = 'I1XowkiAc72fEp2CXPv0'
consumer_secret = 'drfnZHVUQrq1dyeqepCrbKyGWeYJCeTFQZpkLcXkgKFw3P'
access_key = '936432882482143235-jNLGPsCpZaSqR1D2WarSEshgQcyi'
access_secret = 'YF4ddleSgGxj8BsfmH2DELr7TsNNKAp08ZvqC'

# consumer_key = 'qEgHKHnL55g7k4U9xih'
# consumer_secret= 'QcUDHJS04wK5hrmlxV5C4gweiRPDca9JQoc4gp7ft'
# access_key= '863573499436122112-LA60oJLBzwVnhZjGOUPzRsJc'
# access_secret= '8CKFpp6qyxkAk1KfjWJPoHKloppPrvd7Tjiwllyk'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth_handler=auth, parser=JSONParser(), proxy = '127.0.0.1:1080', wait_on_rate_limit=True)

conn = connect('ANXIETY', alias='default', host='localhost', port=27017, username='', password='')  # connect to the local MongoDB

The standard search API only returns tweets from the past week, each call to api.search() returns at most 100 tweets, and the official rate limit allows roughly 450 consecutive requests per window, so we can collect at most around 40,000 tweets. To get as much data as possible we set MAX_QUERIES and page through the results several times. This program only crawls data for Beijing, Shanghai, Hong Kong, Macau, and Taiwan; the topics to crawl are read from line.csv.
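The loop below also relies on a few constants (MAX_QUERIES, until, GMT_FORMAT) whose definitions the post does not show; the values here are assumptions:

# Assumed values; the original post does not show these definitions.
MAX_QUERIES = 4                           # paginated api.search() calls per region (placeholder value)
until = (datetime.date.today() +
         datetime.timedelta(days=7)).strftime('%Y-%m-%d')  # search cutoff pushed a week out, as the post describes
GMT_FORMAT = '%a %b %d %H:%M:%S +0000 %Y' # format of Twitter's created_at timestamps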

regions = ["beijing","shanghai","hongkong","macau","taiwan"]
for line in open("normal_user.csv"):
    #try:
        #u = api.get_user(line)
    #ms = myStream.filter(track=[line])
    #print(results[0])
    for r in regions:  # for each region
        places = api.geo_search(query=r)["result"]["places"][0]  # first look up the place ID for this region
        print(places)
        place_id = places["id"]
        tweet_id = []
        i = MAX_QUERIES
        MAX_ID = 10
        while i > 0:

            if MAX_ID == 10:
                #tweets = api.search(q="place:%s AND line" % place_id)  # filter by place and keyword at the same time
                # push the until cutoff out and request up to 100 tweets per call to maximize volume
                for tweet in api.search(q="place:%s" % place_id, count=100, until=until)["statuses"]:
                    tweet_id.append(tweet["id"])
                    Obj_id = ObjectId()
                    # note: latitude comes first in the returned coordinates, but the Baidu API call
                    # takes longitude first; call GetAddress once per tweet and reuse the result
                    coords = tweet["coordinates"]
                    addr = (GetAddress(coords["coordinates"][1], coords["coordinates"][0])
                            if coords is not None else
                            dict.fromkeys(['province', 'city', 'district', 'street', 'street_number'], 'NULL'))
                    tweet_item = twitter_post(  # build one MongoDB document
                        _id = Obj_id,
                        text_id = tweet["id"],
                        created_at = datetime.datetime.strptime(tweet["created_at"], GMT_FORMAT),
                        screen_name = tweet["user"]["screen_name"],
                        favorite_count = tweet["favorite_count"],
                        retweet_count = tweet["retweet_count"],
                        text = tweet["text"],
                        source = tweet["source"],
                        country_code = (tweet["place"]["country_code"] if tweet["place"] is not None else 'NULL'),
                        location = tweet["user"]["location"],
                        latitude = str(coords["coordinates"][0] if coords is not None else 'NULL'),
                        longitude = str(coords["coordinates"][1] if coords is not None else 'NULL'),
                        time_zone = tweet["user"]["time_zone"],
                        lang = tweet["lang"],
                        province = addr['province'],
                        city = addr['city'],
                        district = addr['district'],
                        street = addr['street'],
                        street_number = addr['street_number']
                        )
                    try:
                        tweet_item.save()  # save to the database
                    except:
                        continue
                MAX_ID = min(tweet_id)
                #print(MAX_ID)

            else:
                for tweet in api.search(q="place:%s" % place_id, count=100, max_id=MAX_ID - 1)["statuses"]:
                    tweet_id.append(tweet["id"])
                    Obj_id = ObjectId()
                    coords = tweet["coordinates"]
                    addr = (GetAddress(coords["coordinates"][1], coords["coordinates"][0])
                            if coords is not None else
                            dict.fromkeys(['province', 'city', 'district', 'street', 'street_number'], 'NULL'))
                    tweet_item = twitter_post(
                        _id = Obj_id,
                        text_id = tweet["id"],
                        created_at = datetime.datetime.strptime(tweet["created_at"], GMT_FORMAT),
                        screen_name = tweet["user"]["screen_name"],
                        favorite_count = tweet["favorite_count"],
                        retweet_count = tweet["retweet_count"],
                        text = tweet["text"],
                        source = tweet["source"],
                        country_code = (tweet["place"]["country_code"] if tweet["place"] is not None else 'NULL'),
                        location = tweet["user"]["location"],
                        latitude = str(coords["coordinates"][0] if coords is not None else 'NULL'),
                        longitude = str(coords["coordinates"][1] if coords is not None else 'NULL'),
                        time_zone = tweet["user"]["time_zone"],
                        lang = tweet["lang"],
                        province = addr['province'],
                        city = addr['city'],
                        district = addr['district'],
                        street = addr['street'],
                        street_number = addr['street_number']
                        )
                    try:
                        tweet_item.save()
                    except:
                        continue
                MAX_ID = min(tweet_id)
                #print(MAX_ID)
            i -= 1

The resulting data looks like this (screenshots of the stored documents omitted).

Generating the word cloud (input: a region and the number of words to keep; output: the word cloud)

(The tricky part: handling English and Chinese text at the same time.)
Approach: use the lang field to split the text by language, build a word cloud for each language separately, then sort and merge the two.
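To make the merge step concrete, here is a tiny illustration with made-up counts (not from the original post):

# Two per-language (word, count) lists are concatenated and re-sorted by count.
en_cloud = [("anxiety", 12), ("exam", 7)]
ch_cloud = [("焦虑", 9), ("考试", 5)]
merged = sorted(en_cloud + ch_cloud, key=lambda t: t[1], reverse=True)
print(merged[:3])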

1. Connect to the database

import traceback

import pymongo

class MongoConn():
    def __init__(self, db_name):
        try:
            url = '127.0.0.1:27017'
            self.client = pymongo.MongoClient(url, connect=True)
            self.db = self.client[db_name]
        except Exception as e:
            print('Failed to connect to MongoDB!')
            traceback.print_exc()

    def destroy(self):
        self.client.close()

    def getDb(self):
        return self.db

    def __del__(self):
        self.client.close()
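A quick usage sketch (not in the original post), assuming the crawler has already populated the twitter_posts collection of the ANXIETY database; count_documents needs pymongo 3.7 or newer:

mon = MongoConn('ANXIETY')
db = mon.getDb()
# count the English-language tweets that were stored
print(db.twitter_posts.count_documents({"lang": "en"}))
mon.destroy()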

2. English text preprocessing

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Stop-word list (not shown in the original; NLTK's English list used here,
# which requires running nltk.download('stopwords') once).
_STOP_WORDS = set(stopwords.words('english'))

def docs_preprocessor(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isdigit()] for doc in docs]

    # Remove stop words.
    docs = [[token for token in doc if token not in _STOP_WORDS] for doc in docs]

    return docs
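For a quick check (made-up tweets; the exact tokens kept depend on the stop-word list):

sample_docs = ["Feeling anxious about exams 2018!!", "Coffee at 3 am again..."]
# prints the lowercased, tokenized docs with pure numbers and stop words removed
print(docs_preprocessor(sample_docs))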

3. Chinese text preprocessing

import jieba

def nltk_tokenize(text):
    features = []
    stop_words = stop.load_stopwords()  # Chinese stop-word list (helper from the original project)

    try:
        # segment the text with jieba (for English, text.split() would be enough)
        for word in jieba.cut(text):
            # keep tokens that are not stop words, are 2-4 characters long,
            # and are not numbers (is_number is a helper from the original project)
            if word not in stop_words and 1 < len(word) < 5 and not is_number(word):
                features.append(word)
    except:
        pass
    return features
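A quick check on a made-up sentence, assuming the project's stop and is_number helpers are available (output depends on jieba's segmentation and the stop-word list):

print(nltk_tokenize("今天北京的天气让人有点焦虑"))
# prints the 2-4 character tokens that survive the stop-word and number filters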

4. Word cloud generator

import collections

class cloudProducer():

    def __init__(self):

        self.mon = MongoConn('ANXIETY')
        self.db = self.mon.getDb()

    def getMainData(self, region_type, region):
        # fetch the past week's data
        en_docs = []
        ch_docs = []

        twitter_in_english = self.db.twitter_posts.find({region_type:region,"lang":"en"})
        twitter_in_chinese = self.db.twitter_posts.find({region_type:region,"lang":"zh"})

        for x in twitter_in_english:
            #print(x)
            en_docs.append(x["text"])
        print(len(en_docs))
        for x in twitter_in_chinese:
            ch_docs.append(x["text"])

        return [en_docs,ch_docs]

    def produce_en_Cloud(self,region_type,region,num):
        #main page
        docs = self.getMainData(region_type,region)[0]
        print(len(docs))
        words_dump = []

        docs = docs_preprocessor(docs)

        for text in docs:
            #print(text)
            #features = text
            #print(features)
            words_dump = words_dump + text
        cloud = collections.Counter(words_dump).most_common(num)
        print(cloud)
        #json.dump(cloud,open("wordCloud.json","w",encoding="utf-8"))

        return cloud

    def produce_ch_Cloud(self,region_type,region,num):
        #main page
        docs = self.getMainData(region_type,region)[1]
        print(len(docs))
        words_dump = []

        for text in docs:
            features = nltk_tokenize(text)
            #print(features)
            words_dump = words_dump + features
        cloud = collections.Counter(words_dump).most_common(num)  # returns a list of (word, count) tuples
        print(cloud)
        #json.dump(cloud,open("wordCloud.json","w",encoding="utf-8"))

        return cloud

    def produce_cloud(self,region_type,region,num):
        en_cloud = self.produce_en_Cloud(region_type,region,num)
        ch_cloud = self.produce_ch_Cloud(region_type,region,num)
        cloud = en_cloud + ch_cloud
        cloud = sorted(cloud,key=lambda t: t[1],reverse=True)  # merge and sort the Chinese and English clouds by count
        return cloud[0:num]

cp = cloudProducer()
cloud = cp.produce_cloud("province","北京市",15)
print(cloud)

Reposted from blog.csdn.net/u014449866/article/details/80158516