Python crawler summary 1

1 A note on regular-expression matching:
import re
a = '<div>index</div>'
word = re.findall('<div>(.*?)</div>', a)
print(word)

  Here (.*?) matches almost any sequence of characters, but on its own it cannot match content that spans multiple lines, for example:
import re
a = '''<div>abc
</div>'''
word = re.findall('<div>(.*?)</div>', a, re.S)
print(word)
This works because the third argument, re.S (DOTALL), makes . match newline characters as well; without it, . stops at a newline, so a pattern whose content spans lines fails to match. When crawling, the captured text is usually stripped of surrounding whitespace and newlines:
print(word[0].strip())
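
A quick way to see what re.S changes (the expected output is shown in the comments):
import re

a = '''<div>abc
</div>'''
# Without re.S the dot does not cross the newline, so nothing matches
print(re.findall('<div>(.*?)</div>', a))          # []
# With re.S the dot also matches newlines, so the content is captured
print(re.findall('<div>(.*?)</div>', a, re.S))    # ['abc\n']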

2 A simple example, crawling the text posts on qiushibaike.com:
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}

info_lists = []

def judgment_sex(class_name):
    if class_name == 'womenIcon':
        return '女'
    else:
        return '男'

def get_info(url):
    res = requests.get(url, headers=headers)
    ids = re.findall('<h2>(.*?)</h2>', res.text, re.S)
    levels = re.findall(r'<div class="articleGender \D+Icon">(.*?)</div>', res.text, re.S)
    sexs = re.findall('<div class="articleGender (.*?)">', res.text, re.S)
    contents = re.findall('<div class="content">.*?<span>(.*?)</span>', res.text, re.S)
    laughs = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i>', res.text, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', res.text, re.S)
    for id,level,sex,content,laugh,comment in zip(ids,levels,sexs,contents,laughs,comments):
        info = {
            'id':id,
            'level':level,
            'sex':judgment_sex(sex),
            'content':content,
            'laugh':laugh,
            'comment':comment
        }
        info_lists.append(info)

if __name__ == '__main__':
    urls = ['http://www.qiushibaike.com/text/page/{}/'.format(str(i)) for i in range(1, 10)]
    for url in urls:
        get_info(url)
    f = open('d:/qiushi.txt', 'a+')
    for info_list in info_lists:
        try:
            f.write(info_list['id'] + '\n')
            f.write(info_list['level'] + '\n')
            f.write(info_list['sex'] + '\n')
            f.write(info_list['content'] + '\n')
            f.write(info_list['laugh'] + '\n')
            f.write(info_list['comment'] + '\n\n')
        except UnicodeEncodeError:
            pass
        # print(info_list)
    f.close()


3 A routine for calling a website's API from Python, here the Amap geocoding API:
import requests
import json
import pprint

address = input('Please enter an address: ')
par = {'address': address, 'key': 'cb649a25c1f81c1451adbeca73623251'}
api = 'http://restapi.amap.com/v3/geocode/geo'
res = requests.get(api, params=par)
json_data = json.loads(res.text)
pprint.pprint(json_data)
   Here json.loads parses the JSON text returned by the API into a Python dict, and pprint pretty-prints the result.
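
After json.loads the response is an ordinary dict, so individual fields can be read directly; a small sketch, assuming the usual Amap layout of a 'geocodes' list whose items carry a 'location' string:
# Assumption: the response looks roughly like
# {'status': '1', 'geocodes': [{'location': '116.48,39.99', ...}], ...}
if json_data.get('geocodes'):
    location = json_data['geocodes'][0]['location']
    lng, lat = location.split(',')
    print('longitude:', lng, 'latitude:', lat)
else:
    print('no result for this address')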


4 MySQL-based crawling, for example grabbing the Douban Top 250 movies:
import requests
from lxml import etree
import re
import pymysql
import time

conn = pymysql.connect(host='localhost', user='root', passwd='38477000', db='python', port=3309, charset='utf8')
cursor = conn.cursor()

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

def get_movie_url(url):
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)

def get_movie_info(url):
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    try:
        name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        actor = actors.xpath('string(.)')
        style = re.findall('<span property="v:genre">(.*?)</span>',html.text,re.S)[0]
        country = re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>',html.text,re.S)[0]
        release_time = re.findall('上映日期:</span>.*?>(.*?)</span>',html.text,re.S)[0]
        movie_time = re.findall('片长:</span>.*?>(.*?)</span>',html.text,re.S)[0]  # renamed so it does not shadow the time module
        score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
        cursor.execute(
            "insert into doubanmovie (name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(name), str(director), str(actor), str(style), str(country), str(release_time), str(movie_time), str(score)))
        conn.commit()
    except IndexError:
        pass

if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
    for url in urls:
        get_movie_url(url)
        time.sleep(5)
    conn.commit()
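
The insert above assumes a doubanmovie table already exists in the python database; a minimal sketch of creating it with the same connection (the column names mirror the INSERT statement, the types and lengths are my assumptions):
# Run once before crawling; adjust the column types/lengths as needed.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS doubanmovie (
        name VARCHAR(200),
        director VARCHAR(200),
        actor TEXT,
        style VARCHAR(100),
        country VARCHAR(100),
        release_time VARCHAR(100),
        `time` VARCHAR(100),
        score VARCHAR(20)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()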

5 Multithreading + async crawling of the Jianshu 7-day hottest list:
from lxml import etree
import requests
import re
import json
from multiprocessing import Pool



header = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def get_url(url):
    html = requests.get(url,headers=header)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//ul[@class="note-list"]/li')
    for info in infos:
        article_url_part = info.xpath('div/a/@href')[0]
        get_info(article_url_part)

def get_info(url):
    article_url = 'http://www.jianshu.com' + url  # the href already starts with a slash
    html = requests.get(article_url,headers=header)
    selector = etree.HTML(html.text)
    author = selector.xpath('//span[@class="name"]/a/text()')[0]

    print(author)
    article = selector.xpath('//h1[@class="title"]/text()')[0]
    print(article)
    date = selector.xpath('//span[@class="publish-time"]/text()')[0]
    print(date)
    word = selector.xpath('//span[@class="wordage"]/text()')[0]
    print(word)
    view = re.findall('"views_count":(.*?),',html.text,re.S)[0]
    print(view)
    comment = re.findall('"comments_count":(.*?),',html.text,re.S)[0]
    print(comment)
    like = re.findall('"likes_count":(.*?),',html.text,re.S)[0]
    print(like)
    id = re.findall('{"id":(.*?),',html.text,re.S)[0]
    gain_url = 'http://www.jianshu.com/notes/{}/rewards?count=20'.format(id)
    wb_data = requests.get(gain_url,headers=header)
    json_data = json.loads(wb_data.text)
    gain = json_data['rewards_count']

    include_list = []
    include_urls = ['http://www.jianshu.com/notes/{}/included_collections?page={}'.format(id,str(i)) for i in range(1,10)]
    for include_url in include_urls:
        html = requests.get(include_url,headers=header)
        json_data = json.loads(html.text)
        includes = json_data['collections']
        if len(includes) == 0:
            pass
        else:
            for include in includes:
                include_title = include['title']
                include_list.append(include_title)
    info ={
        'author':author,
        'article':article,
        'date':date,
        'word':word,
        'view':view,
        'comment':comment,
        'like':like,
        'gain':gain,
        'include':include_list
    }


if __name__ == '__main__':
    urls = ['http://www.jianshu.com/trending/weekly?page={}'.format(str(i)) for i in range(0, 11)]
    pool = Pool(processes=4)
    pool.map(get_url, urls)
    pool.close()
    pool.join()
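
The heading mentions multithreading, but the code above uses a process pool; since the work is network-bound, a thread pool is a drop-in alternative. A sketch using multiprocessing.dummy, which exposes the same Pool interface backed by threads:
# Same map() interface as multiprocessing.Pool, but the workers are threads,
# which is usually enough for I/O-bound crawling.
from multiprocessing.dummy import Pool as ThreadPool

if __name__ == '__main__':
    urls = ['http://www.jianshu.com/trending/weekly?page={}'.format(str(i)) for i in range(0, 11)]
    pool = ThreadPool(4)
    pool.map(get_url, urls)
    pool.close()
    pool.join()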


6 Form submission: a routine that submits a FORM via POST, here grabbing job listings from Lagou.com:
import requests
import json
import time

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#lagou = mydb['lagou']

headers = {
    'Cookie':'XXXXXX',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Connection':'keep-alive'
}

def get_page(url,params):
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    print(json_data)
    total_Count = json_data['content']['positionResult']['totalCount']
    page_number = int(total_Count/15) if int(total_Count/15)<30 else 30
    get_info(url,page_number)

def get_info(url,page):
    for pn in range(1,page+1):
        params = {
            'first': 'true',
            'pn': str(pn),
            'kd': 'Python'
        }
        try:
            html = requests.post(url,data=params,headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
            for result in results:
                compangeName=result['companyFullName']
                print(compangeName)
                infos = {
                    'businessZones':result['businessZones'],
                    'city':result['city'],
                    'companyFullName':result['companyFullName'],
                    'companyLabelList':result['companyLabelList'],
                    'companySize':result['companySize'],
                    'district':result['district'],
                    'education':result['education'],
                    'explain':result['explain'],
                    'financeStage':result['financeStage'],
                    'firstType':result['firstType'],
                    'formatCreateTime':result['formatCreateTime'],
                    'gradeDescription':result['gradeDescription'],
                    'imState':result['imState'],
                    'industryField':result['industryField'],
                    'jobNature':result['jobNature'],
                    'positionAdvantage':result['positionAdvantage'],
                    'salary':result['salary'],
                    'secondType':result['secondType'],
                    'workYear':result['workYear']
                }
               # lagou.insert_one(infos)
                time.sleep(10)
        except requests.exceptions.ConnectionError:
            pass

if __name__ == '__main__':
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    params = {
        'first': 'true',
        'pn': '1',
        'kd': 'Python'
    }
    get_page(url, params)
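
The commented-out lines at the top of this routine hint at saving the results to MongoDB; a minimal sketch of enabling that, assuming a local mongod on the default port:
import pymongo

# Connect to the local MongoDB instance and pick a database/collection
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
lagou = mydb['lagou']

# then, inside get_info, un-comment the insert:
# lagou.insert_one(infos)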

7 I found a good foreign online tool for making word clouds, with many styles; I recommend https://wordart.com/create. As an example, crawl the posts from your Sina Weibo friends feed:

import requests
import json

headers = {
    'Cookie':'XXXX'
}

f = open('d:/weibo.txt','a+',encoding='utf-8')

def get_info(url,page):
    html = requests.get(url,headers=headers)
    json_data = json.loads(html.text)
    card_groups = json_data[0]['card_group']
    for card_group in card_groups:
        f.write(card_group['mblog']['text'].split(' ')[0]+'\n')

    next_cursor = json_data[0]['next_cursor']

    if page < 50:
        next_url = 'https://m.weibo.cn/index/friends?format=cards&next_cursor=' + str(next_cursor) + '&page=1'
        page = page + 1
        get_info(next_url, page)
    else:
        f.close()

if __name__ == '__main__':
    url = 'https://m.weibo.cn/index/friends?format=cards'
    get_info(url,1)


Then segment the text with jieba and extract the top keywords:
import jieba.analyse
path = 'd:/weibo.txt'
fp = open(path, 'r', encoding='utf-8')
content = fp.read()
try:
    jieba.analyse.set_stop_words(r'G:\python学习相关\stop_words_zh.txt')
    tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)
    for item in tags:
        print(item[0]+'\t'+str(int(item[1]*1000)))
finally:
    fp.close()
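
The loop above prints word/weight pairs; a small variation that writes them to a file instead, so they can be pasted into the word-cloud tool (the file name and the tab separator are my choices):
# Write "word<TAB>weight" lines for importing into https://wordart.com/create
with open('d:/weibo_tags.txt', 'w', encoding='utf-8') as out:
    for word, weight in tags:
        out.write(word + '\t' + str(int(weight * 1000)) + '\n')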

 
