Sina Weibo Crawler

This is a version that does not log in, so the number of pages it can crawl is limited.
The main point is the order of the JSON-handling steps; this post is a record of that.
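
The endpoint used below returns JSON whose data field holds an HTML fragment, so the order of the steps matters: decode the JSON first, take the data field, strip escape and invisible characters, and only then hand the string to lxml. A minimal sketch of that order (the only response key assumed here is data, which is what the crawler code below relies on):

import json
from lxml import etree

def html_from_feed_json(raw):
    # 1. the response body is JSON, not HTML
    payload = json.loads(raw)
    # 2. the 'data' field holds an HTML fragment (assumed key, matching the crawler below)
    fragment = payload['data']
    # 3. remove escape sequences and invisible characters that break parsing
    for ch in ('\n', '\r', '\\', '\u200b', '\xa0'):
        fragment = fragment.replace(ch, '')
    # 4. only now can the fragment be queried with XPath
    return etree.HTML(fragment)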



import json
from copy import deepcopy

import requests
from lxml import etree


class Weibo(object):
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
            "Cookie": "SINAGLOBAL=7599228186083.695.1499079704863; UM_distinctid=1650e0b9fd5fe-0be0aef851589a-737356c-100200-1650e0b9fd6460; login_sid_t=528fe9d40502191efa7a39c3570e2648; cross_origin_proto=SSL; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; TC-V5-G0=ac3bb62966dad84dafa780689a4f7fc3; _s_tentry=www.baidu.com; wb_view_log=1366*7681; Apache=8910395867108.623.1535108015266; ULV=1535108015274:87:4:1:8910395867108.623.1535108015266:1534297139876; SCF=AuQUwf63PoPCuREHCy74Ls6fWNyx_TM0sEp-kb67np3Z2aYZUSXa1-yiaae_ba4wR3vkr3Wl7B8DHybqaQJXt24.; SUHB=0V5V55cALO8NM_; un=13913292465; wb_view_log_6108068337=1366*7681; TC-Page-G0=0cd4658437f38175b9211f1336161d7d; SUB=_2AkMsI2sVdcPxrAVQmPAVymPhb4pH-jyf9gLjAn7uJhMyAxh77lA3qSVutBF-XA8geeqirJoe-l4iNQQ2DTjSG_iC; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWYAY6v3pepGn0LLaK2PXs85JpVF0201K20eo.EShqf; UOR=www.baidu.com,vdisk.weibo.com,login.sina.com.cn; WBStorage=e8781eb7dee3fd7f|undefined",
            # "Host":"weibo.com",

        }
        # local proxy used by the original author; adjust or drop proxies if you do not need one
        self.proxies = {"https": "https://192.168.43.1:1800"}
        self.s = requests.Session()
        self.page_num = 1

    def get_url(self):
        return [
            "https://weibo.com/a/aj/transform/loadingmoreunlogin?" \
            "ajwvr=6&category=0&page={}&lefnav=0&cursor=".format(i)
            for i in range(20)]

    def parse_url(self, url):
        response = self.s.get(url, proxies=self.proxies, headers=self.headers).content.decode()
        content = json.loads(response)
        # the 'data' field of the JSON response holds an HTML fragment
        content = content['data']

        # strip escape sequences and invisible characters before parsing
        content = content.replace('\n', '')
        content = content.replace('\r', '')
        content = content.replace('\\', '')
        content = content.replace('\u200b', '')
        content = content.replace('\xa0', '')

        try:
            content = etree.HTML(content)
        except Exception:
            return None

        # the feed mixes three card layouts: UG_list_a, UG_list_b and UG_list_v2
        msgs = content.xpath('//div[@class="UG_list_a"] | //div[@class="UG_list_b"] | //div[@class="UG_list_v2"]')
        print(url)
        print(len(msgs))
        data = []
        for msg in msgs:
            item = {}
            # each card comes in two layouts, hence the alternative XPath for every field
            item['content'] = msg.xpath(
                './/div/div[2]/h3/div/text() | .//div/h3/div/text()')
            print(item['content'])
            item['time'] = msg.xpath(
                './/div/div[2]/div/span[1]/text() |.//div/span[1]/text()')[0]
            item['author'] = msg.xpath(
                './/div[2]/div[1]/a[2]/span/text()| .//div[2]/a[2]/span/text()')[0]
            item['forward'] = msg.xpath(
                './/span/em[@class="W_ficon ficon_forward S_ficon W_f16"]/following-sibling::em[1]/text()')[0]
            item['comment'] = msg.xpath(
                './/em[@class="W_ficon ficon_repeat S_ficon W_f16"]/following-sibling::em[1]/text()')[0]
            item['praised'] = msg.xpath(
                './/span/em[@class="W_ficon ficon_praised S_ficon W_f16"]/following-sibling::em[1]/text()')[0]
            item['img_src'] = msg.xpath(
                './/div[contains(@class,"pic W_piccut_v")]/img/@src')
            # print(item)
            data.append(deepcopy(item))

        return data

    def save_data(self, data):
        if data:
            print('data:', data)
            with open('weibo.csv', 'a+', encoding='utf-8') as f:
                for i in data:
                    print('record:', i)
                    img_src = "".join(i['img_src'])
                    c = "".join(i['content'])
                    f.write(c + ',' +
                            i['author'] + ',' +
                            i['time'] + ',' +
                            i['forward'] + ',' +
                            i['praised'] + ',' +
                            i['comment'] + ',' +
                            img_src + '\r\n')

    def run(self):
        urls = self.get_url()
        for url in urls:
            data = self.parse_url(url)
            self.save_data(data)
            print('page {}'.format(self.page_num))
            self.page_num += 1


if __name__ == '__main__':
    weibo = Weibo()
    weibo.run()
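
The comma-joined f.write in save_data breaks as soon as a post text itself contains a comma. A safer variant, sketched here with the standard csv module rather than taken from the original post, quotes every field:

import csv

def save_data_csv(data, path='weibo.csv'):
    # quote every field so commas and quotes inside the post text stay intact
    if not data:
        return
    with open(path, 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        for i in data:
            writer.writerow([
                "".join(i['content']),
                i['author'],
                i['time'],
                i['forward'],
                i['praised'],
                i['comment'],
                "".join(i['img_src']),
            ])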


Reposted from blog.csdn.net/wu0che28/article/details/82193585