Python3 -- Data Scraping: Scraping Information from the 911 Website

Here is the code:

#***************************************************
# This script scrapes name information from the 911 website
# IP.txt is the local file where I keep my proxy IP addresses
#
#***************************************************

import requests,csv
import pandas as pd
import time,random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
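
# Note: because requests is called with verify=False below, urllib3 emits an
# InsecureRequestWarning on every request; the two lines below are an optional
# way to silence that warning.
import urllib3
urllib3.disable_warnings()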

# Read the CSV file with pandas
def getNames(csvfile):
    data = pd.read_csv(csvfile)                   # 1 -- the file's encoding may need attention
    names = data['name']
    return names
    
'''
Get the list of proxy IPs.
A large batch of IP addresses was scraped beforehand and saved to IP.txt.
For how the IPs were obtained, see my other post (Python -- Proxy IP).
'''
def get_ip_list():
    with open('IP.txt', 'r') as f:
        ip_list = f.readlines()
    return ip_list
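
# A sketch of what each line of IP.txt is assumed to look like (one proxy per line;
# the exact format depends on how the proxies were collected):
#   http://123.45.67.89:8080
#   http://98.76.54.32:3128
# requests also accepts the bare "ip:port" form for an HTTP proxy.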
  
# Pick a random IP from the list
def get_random_ip(ip_list):
    proxy_ip = random.choice(ip_list)
    proxy_ip = proxy_ip.strip('\n')
    proxies = {'http': proxy_ip}
    return proxies

# Fetch a page and return the status code along with the content we need
def parsePage(url, ip_list):
    headers = {'User-Agent': str(UserAgent().random)}
    proxies = get_random_ip(ip_list)
    try:
        # With verify=False, requests skips SSL certificate verification
        r = requests.get(url, proxies=proxies, headers=headers, timeout=10, verify=False)
    except requests.RequestException:
        print('Request failed, pausing for 20 seconds')
        time.sleep(20)
        headers = {'User-Agent': str(UserAgent().random)}
        proxies = get_random_ip(ip_list)
        r = requests.get(url, proxies=proxies, headers=headers, timeout=10, verify=False)
    if r.status_code == 200:                  # 200 means the page was fetched; 404 means nothing was found for this name
        soup = BeautifulSoup(r.text, 'lxml')
        body = soup.find("div", class_="pp")
        contents = body.find_all('p')
        return r.status_code, contents
    else:
        return r.status_code, None

def getDict(contents):
    namesChineseTransliteration = []        # Chinese transliteration
    namesGender = []                        # gender of the name
    namesFromLanguage = []                  # language of origin
    namesMoral = []                         # connotation of the name
    namesImpression = []                    # impression the name gives
    namesMeaning = []                       # meaning of the name

    # get_text() returns all the text inside a tag, including text in descendant tags, as a Unicode string
    # [4:] keeps everything from the fifth character onward (drops the leading field label)

    namesChineseTransliteration.append(contents[1].get_text()[4:])
    namesGender.append(contents[-5].get_text()[4:])
    namesFromLanguage.append(contents[-4].get_text()[4:])
    namesMoral.append(contents[-3].get_text()[4:])
    namesImpression.append(contents[-2].get_text()[4:])
    namesMeaning.append(contents[-1].get_text()[4:])

    str_row = (namesChineseTransliteration + namesGender + namesFromLanguage
               + namesMoral + namesImpression + namesMeaning)

    return str_row

# Append one row of information to the output file
def write_file(filePath, row):
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(row)
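
# Each output row uses '|' as the field separator; the fields are, in order:
#   name | Chinese transliteration | gender | source language | moral | impression | meaning
# (the name is prepended in the main loop below; an illustrative, made-up row might be
#  "Helen|海伦|女|英语|...")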

if __name__ == "__main__":
    for i in range(26, 27):
        names = getNames("{}.csv".format(i))                      # names to look up, read from the i-th CSV file
        base_url = "https://myingwenming.911cha.com/"             # kzb -- appended xm_ after the address
        ip_list = get_ip_list()
        for j in range(len(names)):
            url = base_url + names[j] + ".html"
            status_code, contents = parsePage(url, ip_list)
            print(names[j], status_code)
            if status_code == 200:               # 200: scraped successfully; 404: nothing found
                str_row = getDict(contents)
                row = [names[j]] + str_row
                write_file("爬取成功的人名/new{}.csv".format(i), row)     # directory name means "successfully scraped names"
            else:
                continue
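
For reference, each input file (e.g. 26.csv in the loop above) only needs a 'name' column, since that is the column getNames reads, and its values are appended directly to the base URL. A minimal sketch of such a file, with hypothetical contents:

name
Helen
Alice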


Comments and discussion are welcome!

Reposted from blog.csdn.net/qq_38251616/article/details/79762907