Scraping WeChat Official Account Articles (via Sogou WeChat Search)

1. Log in through a simulated browser and grab the cookies.

2. Visit the pages with requests.get(), cookies attached.

3. Anti-anti-scraping (approach still TBD; for now it's proxy IPs + sleep intervals. Sogou's playbook: ban your IP + ban your cookie + put repeat visitors under special watch [there's an expletive I really need to get off my chest here]). A rotation sketch follows this list.
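As a rough illustration of the proxy + sleep approach, here is a minimal sketch of rotating through a proxy pool with randomized sleeps and a couple of retries. The pool format (requests-style proxy dicts) and the retry count are my assumptions, not something Sogou dictates:

import random
import time
import requests

def fetch_with_rotation(url, proxy_pool, headers=None, cookies=None, max_tries=3):
    #proxy_pool is assumed to look like [{"http": "http://1.2.3.4:8080"}, ...]
    for _ in range(max_tries):
        proxy = random.choice(proxy_pool)
        try:
            resp = requests.get(url, headers=headers, cookies=cookies,
                                proxies=proxy, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass  # dead proxy or ban page; fall through and retry
        time.sleep(random.uniform(3, 8))  # a randomized pause looks less bot-like
    return None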

Attached below is code that just about works (pick proxy IPs and sleep intervals to suit your own situation).

PS: the code for building the proxy IP pool (get_ip_pools) is in the pinned post.
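The script below calls main_function() from get_ip_pools and picks random entries from the result, so it is assumed to return a non-empty list of requests-style proxy dicts. A hypothetical stand-in if you don't have the pinned-post version:

#hypothetical stand-in for get_ip_pools.main_function(); the real one is in the pinned post
def main_function():
    #each entry is a proxies dict that requests accepts directly
    return [
        {"http": "http://1.2.3.4:8080"},
        {"http": "http://5.6.7.8:3128"},
    ]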

from selenium import webdriver
import requests
import time
import os  # used by check_mkdir below; missing in the original
from bs4 import BeautifulSoup
import re
from mysql_py import *
import threading
from get_ip_pools import *
import random

#get cookie
def get_cookies():
    driver = webdriver.Chrome()
    driver.get("http://weixin.sogou.com/")

    driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
    time.sleep(10)  # leave time to scan the WeChat login QR code by hand

    cookies = driver.get_cookies()
    cookie = {}
    for item in cookies:
        cookie[item.get('name')] = item.get('value')
    return cookie

#normalize relative urls to absolute ones
def get_total_url(url):
    if url.startswith("//"):
        url = "http:" + url
    elif url.startswith("/"):
        #the original prepended "http:/", which yields an invalid url;
        #assuming site-relative paths belong to the article host
        url = "http://mp.weixin.qq.com" + url
    return url
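# e.g. get_total_url("//mmbiz.qpic.cn/pic.jpg") -> "http://mmbiz.qpic.cn/pic.jpg"
# (hypothetical sample path, just to illustrate the normalization)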

#init header
header = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Connection':'keep-alive',
    'Host':'weixin.sogou.com',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    }

#init proxys (assumed: main_function() returns a non-empty list of requests-style proxy dicts)
alright_proxys = main_function()

#get total page num
def get_page_count(search,cookie):
    global header
    page_source = requests.get("http://weixin.sogou.com/weixin?query=%s&type=2&page=1"%search,
                               cookies = cookie,headers = header,
                               proxies = random.choice(alright_proxys)).content
    bs_obj = BeautifulSoup(str(page_source,encoding = "utf-8"),"html.parser")
    #the div with class "mun" holds the result count text, digits extracted below
    item_count_str = bs_obj.find("div",{"class":"mun"}).text
    pattern = re.compile(r'\d+')
    total_item_count = pattern.findall(item_count_str.replace(",",""))[0]
    page_count = int(int(total_item_count)/10)  # Sogou shows 10 results per page
    return page_count

#create the directory if it does not exist yet
def check_mkdir(path):
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except Exception:
            pass  # another thread may have created it first
        
#download imgs from one article page and record them in the database
def get_img(url,num,connect,cursor):
    global alright_proxys
    #article pages live on mp.weixin.qq.com, so drop the Sogou-specific Host header
    article_header = {k:v for k,v in header.items() if k != 'Host'}
    #was urllib's request.get, which does not exist; also route through the proxy pool
    response = requests.get(url,headers = article_header,
                            proxies = random.choice(alright_proxys)).content
    content = str(response,encoding = "utf-8")
    bs_obj = BeautifulSoup(content,"html.parser")
    img_list = bs_obj.findAll("img")
    count = 0
    for img in img_list:
        try:
            imgurl = get_total_url(img.attrs["data-src"])
            store_name = "%s"%num + "%s"%count
            path = r"C:\Users\Mr.Guo\Pictures\weixin"
            check_mkdir(path)
            #urllib.request.urlretrieve(imgurl,r"C:\Users\Mr.Guo\Pictures\weixin\%s.jpeg" %store_name)
            insert_into_table(connect,cursor,store_name,imgurl)  # was `html` (undefined); storing the img url
            count += 1
            time.sleep(5)
        except Exception:
            pass  # some <img> tags have no data-src; skip them

#main function: walk the result pages and spawn one download thread per article
def main_fun(page_count,search,cookie,connect,cursor):
    global header
    for i in range(page_count):
        page_source = requests.get("http://weixin.sogou.com/weixin?query=%s&type=2&page=%s"%(search,i + 1),
                                   cookies = cookie,headers = header,
                                   proxies = random.choice(alright_proxys)).content
        bs_obj = BeautifulSoup(str(page_source,encoding = "utf-8"),"html.parser")
        url_list = bs_obj.findAll("div",{"class":"txt-box"})
        final_url_list = []
        for url in url_list:
            final_url_list.append(url.h3.a.attrs['href'])
        for url_num in range(len(final_url_list)):
            t = threading.Thread(target = get_img,args = (final_url_list[url_num],url_num,connect,cursor))
            #time.sleep(3)
            t.start()
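One caveat: every get_img thread shares the single connect/cursor pair, and MySQL client connections (pymysql included) are not thread-safe. A minimal sketch of serializing the inserts with a lock; the db_lock object and the safe_insert wrapper are my additions, not part of the original code:

import threading

db_lock = threading.Lock()  # hypothetical module-level lock

def safe_insert(connect, cursor, store_name, imgurl):
    #serialize access to the shared connection across download threads
    with db_lock:
        insert_into_table(connect, cursor, store_name, imgurl)

Have get_img call safe_insert instead of insert_into_table and the threads stop stepping on each other's queries.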
-----------------------Caller script below; part of it is for Taobao shops, also covered in the pinned post------------------------divider: these are two separate files
from final_test import *
from mysql_py import *
import threading
from sougou_wechat import *
#choice
choice = input('''Which data do you want to fetch:
                    a. Taobao shops
                    b. Official account articles
                ''')
if choice == 'a':
    db,db_cursor = init_fun("taobao")
    check_tab_exist(db,db_cursor)
    href_list = get_item_href()
    for i in range(len(href_list)):
        start_url = href_list[i]
        store_list = []  # never initialized in the original; assumed to collect the shops for each start_url
        get_shop_url(store_list,start_url)
        for shop_url in store_list:
            print(shop_url)
            t = threading.Thread(target = get_img_url,args = (shop_url,db,db_cursor))
            t.start()
            #t.join()  # whether to block here is still undecided
        
elif choice == 'b':
    db,db_cursor = init_fun("weixin")
    check_tab_exist(db,db_cursor)
    my_cookie = get_cookies()    
    search = input("Enter the keyword you want to search for")
    page_num = get_page_count(search,my_cookie)
    main_fun(page_num,search,my_cookie,db,db_cursor)
Database operations are mixed in throughout; keep or drop them as you see fit.
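Since mysql_py lives in the pinned post, here is a hypothetical minimal version showing the contract the scripts above rely on (init_fun, check_tab_exist, insert_into_table); the table and column names are my guesses, not the original schema:

import pymysql

def init_fun(db_name):
    #connect and create the database if needed (db_name comes from trusted code, not user input)
    connect = pymysql.connect(host="localhost", user="root", password="",
                              charset="utf8mb4")
    cursor = connect.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS %s" % db_name)
    cursor.execute("USE %s" % db_name)
    return connect, cursor

def check_tab_exist(connect, cursor):
    #guessed single-table schema: a generated name plus an image url
    cursor.execute("""CREATE TABLE IF NOT EXISTS imgs (
                          store_name VARCHAR(64),
                          img_url    VARCHAR(512)
                      )""")
    connect.commit()

def insert_into_table(connect, cursor, store_name, img_url):
    cursor.execute("INSERT INTO imgs (store_name, img_url) VALUES (%s, %s)",
                   (store_name, img_url))
    connect.commit()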



Reposted from blog.csdn.net/mr_guo_lei/article/details/78570744