Crawler Project in Practice 11: Scraping Dangdang Product Information

Goal

Scrape Dangdang product information in batches and save it to a local CSV file.

Project preparation

Software: PyCharm
Libraries: requests, fake_useragent, lxml (third-party) and csv (standard library)
Website address: http://search.dangdang.com/

Website analysis

Open the site and search for a product, such as doudou shoes (豆豆鞋). The resulting URL is:

http://search.dangdang.com/?key=%B6%B9%B6%B9%D0%AC&act=input

The key parameter is the URL-encoded product name.
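The percent escapes in the key value are the GBK encoding of the Chinese keyword. A minimal sketch of producing them, assuming the site expects GBK-encoded keywords (which the escapes above suggest):

import urllib.parse

# '豆豆鞋' (doudou shoes) encoded as GBK yields the escapes seen in the search URL
print(urllib.parse.quote('豆豆鞋', encoding='gbk'))  # %B6%B9%B6%B9%D0%AC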
Next, check whether the page is static. Press F12 to open the developer tools, or press Ctrl+U to view the page source, then search the source for a product title copied from the page. The title can be found in the source, so this is a static page.
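The same check can be scripted. A minimal sketch, assuming the product list uses the p elements with class "name" that the XPath expressions later in this article rely on:

import requests

# If the product markup appears in the raw response body, the page is
# server-rendered and can be parsed without a browser
resp = requests.get('http://search.dangdang.com/?key=%B6%B9%B6%B9%D0%AC&act=input')
print('class="name"' in resp.text)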

Page number analysis

http://search.dangdang.com/?key=%B6%B9%B6%B9%D0%AC&act=input&page_index=1
http://search.dangdang.com/?key=%B6%B9%B6%B9%D0%AC&act=input&page_index=2
http://search.dangdang.com/?key=%B6%B9%B6%B9%D0%AC&act=input&page_index=3

Comparing these URLs shows that only the page_index parameter changes from page to page.
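The full set of page URLs can therefore be produced by substituting the page number into a template:

# Only page_index varies between pages of the same search
base = 'http://search.dangdang.com/?key={}&act=input&page_index={}'
for page in range(1, 4):
    print(base.format('%B6%B9%B6%B9%D0%AC', page))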

Anti-scraping analysis

Repeated requests from the same IP address risk being blocked, so fake_useragent is used here to generate a random User-Agent request header.
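Each read of ua.random returns a different real-browser User-Agent string, for example:

from fake_useragent import UserAgent

ua = UserAgent()
for _ in range(3):
    # A different browser User-Agent string on each access
    print(ua.random)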

Code

1. Import the required libraries, define a class that inherits from object, and give it an __init__ method and a main method.

import requests
from fake_useragent import UserAgent
from lxml import etree
import csv

class DangDang(object):
    def __init__(self):
        # URL template: the keyword and page number are filled in later
        self.url = 'http://search.dangdang.com/?key={}&page_index={}'
        ua = UserAgent(verify_ssl=False)
        # Random User-Agent header to reduce the risk of being blocked
        self.headers = {'User-Agent': ua.random}

    def main(self):
        pass

if __name__ == '__main__':
    spider = DangDang()
    spider.main()

2. Send a request to get the web page.

    def get_html(self, url):
        # Request the page with the random User-Agent header
        response = requests.get(url, headers=self.headers)
        return response.text
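If the returned text looks garbled, the page's charset is the likely cause. A hedged variant that forces a GBK-compatible decoding (an assumption; Dangdang pages have historically been GBK-encoded):

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        # Assumption: the page is GBK-encoded; gb18030 is a superset of GBK
        response.encoding = 'gb18030'
        return response.text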

3. Parse the web page to extract the product information and save it locally.

    def parse_html(self, html):
        target = etree.HTML(html)
        # Each product's title, price, and link sit in the search-result list
        titles = target.xpath('//p[@class="name"]/a/@title')
        prices = target.xpath('//p[@class="price"]/span/text()')
        links = target.xpath('//p[@class="name"]/a/@href')
        with open('F:/pycharm文件/document/dangdang.csv', 'a', newline='', encoding='gb18030') as f:
            csvwriter = csv.writer(f, delimiter=',')
            for title, price, link in zip(titles, prices, links):
                csvwriter.writerow([title, price, link])
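The file is opened in append mode, so repeated runs add rows without any header. A small variant writes a header row only when the file does not exist yet (the path and column names here are illustrative):

import csv
import os

path = 'dangdang.csv'  # illustrative path
write_header = not os.path.exists(path)
with open(path, 'a', newline='', encoding='gb18030') as f:
    writer = csv.writer(f)
    if write_header:
        writer.writerow(['title', 'price', 'link'])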

4. Main function and function call.

    def main(self):
        product = input('Enter the product to search for: ')
        end_page = int(input('How many pages to crawl: '))
        for page in range(1, end_page + 1):
            url = self.url.format(product, page)
            print('Page %s....' % page)
            html = self.get_html(url)
            self.parse_html(html)
            print('Page %s done' % page)

Results

Run the program, then open the local CSV file to view the scraped data.
The complete code is as follows:

import requests
from fake_useragent import UserAgent
from lxml import etree
import csv

class DangDang(object):
    def __init__(self):
        # URL template: the keyword and page number are filled in later
        self.url = 'http://search.dangdang.com/?key={}&page_index={}'
        ua = UserAgent(verify_ssl=False)
        # Random User-Agent header to reduce the risk of being blocked
        self.headers = {'User-Agent': ua.random}

    def get_html(self, url):
        # Request the page with the random User-Agent header
        response = requests.get(url, headers=self.headers)
        return response.text

    def parse_html(self, html):
        target = etree.HTML(html)
        # Each product's title, price, and link sit in the search-result list
        titles = target.xpath('//p[@class="name"]/a/@title')
        prices = target.xpath('//p[@class="price"]/span/text()')
        links = target.xpath('//p[@class="name"]/a/@href')
        with open('F:/pycharm文件/document/dangdang.csv', 'a', newline='', encoding='gb18030') as f:
            csvwriter = csv.writer(f, delimiter=',')
            for title, price, link in zip(titles, prices, links):
                csvwriter.writerow([title, price, link])

    def main(self):
        product = input('Enter the product to search for: ')
        end_page = int(input('How many pages to crawl: '))
        for page in range(1, end_page + 1):
            url = self.url.format(product, page)
            print('Page %s....' % page)
            html = self.get_html(url)
            self.parse_html(html)
            print('Page %s done' % page)

if __name__ == '__main__':
    spider = DangDang()
    spider.main()
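To check the output programmatically, read the CSV back with the same encoding it was written with:

import csv

# Each saved row is [title, price, link]
with open('F:/pycharm文件/document/dangdang.csv', encoding='gb18030') as f:
    for row in csv.reader(f):
        print(row)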

Disclaimer: for personal study and reference only.
