Crawling information on all residential communities (小区) in a specified city from Anjuke

While crawling I found that visiting the site too frequently triggers a slider-verification popup, so a random delay is inserted between requests; this keeps the crawl from being interrupted and allows the complete information to be collected. I chose the Qingdao area, but as a follow-up the city name could be taken as input so that other cities can be crawled as well. The average price on a community's detail page is generated dynamically: it comes from a separate JSON request with a fairly complex URL, so fetching it would mean sending an extra request per community. Instead, the average price is read directly from the primary listing page and passed into the detail-page parsing function, which improves efficiency. A minimal sketch of the random-delay idea is shown first, followed by the complete code:
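The sketch below is illustrative only and not part of the spider itself: it simply shows a randomized pause between consecutive listing-page requests. The page range is a placeholder, and requests sent without the headers built later in the full code may still be blocked.

import random
import time

import requests

# Minimal sketch of the anti-blocking idea: pause a random 1-3 seconds between
# requests so the site does not pop up its slider verification.
for page in range(1, 4):  # placeholder page range
    url = 'https://qd.anjuke.com/community/p{}/'.format(page)
    response = requests.get(url)
    print(url, response.status_code)
    time.sleep(random.randint(1, 3))  # random delay before the next request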

"""
    Crawl information on all residential communities listed on Anjuke.
"""
import requests
from fake_useragent import UserAgent
from lxml import etree
import csv
import re
import time
import random


class AnjukeSpider(object):
    def __init__(self):
        # Listing pages for the Qingdao area; {} is replaced by the page number
        self.url = 'https://qd.anjuke.com/community/p{}/'

    def get_headers(self):
        """
        Build the request headers.
        :return:
        """
        ua = UserAgent()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": "aQQ_ajkguid=534DDCC9-5DBA-263A-CF4D-SX0716083828; isp=true; 58tj_uuid=e559fdad-fdb9-4a73-8c60-9e6e3bf82987; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1563237510; als=0; _ga=GA1.2.1881437242.1569052175; ctid=30; wmda_uuid=edd62dcc1e73bddc16beeb56087fd1f8; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; sessid=F6826357-F68F-1E17-B5A1-99FEA17341CA; lps=http%3A%2F%2Fwww.anjuke.com%2F%7Chttps%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DcuNIKoO-jX3CGzD7komT_lY2umPIHgZjjBdMMdFnpZHirHVPOLorVTafN32HS5R_%26ck%3D7150.2.84.414.190.439.289.917%26shh%3Dwww.baidu.com%26sht%3D02003390_42_hao_pg%26wd%3D%26eqid%3Dc2951ba5003c81ad000000065d881f86; twe=2; wmda_session_id_6289197098934=1569202063874-b62b0050-2be7-3851; _gid=GA1.2.388348263.1569202065; init_refer=https%253A%252F%252Fwww.baidu.com%252Flink%253Furl%253DcuNIKoO-jX3CGzD7komT_lY2umPIHgZjjBdMMdFnpZHirHVPOLorVTafN32HS5R_%2526ck%253D7150.2.84.414.190.439.289.917%2526shh%253Dwww.baidu.com%2526sht%253D02003390_42_hao_pg%2526wd%253D%2526eqid%253Dc2951ba5003c81ad000000065d881f86; new_uv=3; new_session=0",
            "referer": "https://qd.anjuke.com/community/?from=navigation",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": ua.random
        }
        return headers

    def get_link(self, url):
        """
        Parse a listing page to get each community's detail-page link and average price.
        :param url:
        :return:
        """
        text = requests.get(url=url, headers=self.get_headers()).text
        html = etree.HTML(text)
        link = html.xpath("//h3/a/@href")
        price = html.xpath('//*[@id="list-content"]/div/div[2]/p[1]/strong/text()')
        print(link)
        print(price)
        for i in zip(link, price):
            print(i)
        return zip(link, price)

    def parse_message(self, url, price):
        """
        Parse the required information from a community's detail (second-level) page.
        :param url:
        :param price:
        :return:
        """
        # The field labels are given here in English; on the live site the scraped
        # labels are Chinese, and the dictionary keys must match them exactly.
        dict_result = {'community': '-', 'address': '-', 'price': '-', 'property type:': '-',
                       'property fee:': '-', 'total floor area:': '-', 'total households:': '-',
                       'year built:': '-', 'parking spaces:': '-', 'plot ratio:': '-',
                       'greening rate:': '-', 'developer:': '-', 'property company:': '-',
                       'business district:': '-', 'second-hand listings:': '-',
                       'rental listings:': '-', 'related schools:': '-'}
        text = requests.get(url=url, headers=self.get_headers()).text
        html = etree.HTML(text)
        table1 = html.xpath('/html/body/div[2]/div[3]/div[1]/h1//text()')  # community name and address
        table1 = list(map(lambda item: re.sub(r'\s+', '', item), table1))  # strip newlines and tabs
        table1 = list(filter(None, table1))  # drop the empty strings produced by the previous step
        dict_result['community'] = table1[0]
        dict_result['address'] = table1[1]
        dict_result['price'] = price
        table2 = html.xpath('//*[@id="basic-infos-box"]/dl//text()')
        table2 = list(map(lambda item: re.sub(r'\s+', '', item), table2))
        table2 = list(filter(None, table2))
        table2_list1 = table2[::2]   # field labels
        table2_list2 = table2[1::2]  # field values
        table2_list3 = zip(table2_list1, table2_list2)
        for j in table2_list3:
            dict_result[j[0]] = j[1]
        # price = html.xpath('//*[@id="basic-infos-box"]/div[1]/span[1]/text()')
        # The price on this page is loaded from a JSON file, so it cannot be matched here;
        # it is taken from the listing page instead.
        # dict_result['price'] = price[0]
        table3 = html.xpath('//*[@id="basic-infos-box"]/div[2]//text()')
        table3 = list(map(lambda item: re.sub(r'\s+', '', item), table3))
        table3 = list(filter(None, table3))
        table3_list1 = table3[::2]
        table3_list2 = table3[1::2]
        table3_list3 = zip(table3_list1, table3_list2)
        for j in table3_list3:
            dict_result[j[0]] = j[1]
        print(dict_result)
        return dict_result

    def save_csv(self, result):
        """
        Save the scraped information to a CSV file.
        :param result:
        :return:
        """
        # A list (rather than a set) keeps the column order stable.
        headers = ['community', 'address', 'price', 'property type:', 'property fee:',
                   'total floor area:', 'total households:', 'year built:', 'parking spaces:',
                   'plot ratio:', 'greening rate:', 'developer:', 'property company:',
                   'business district:', 'second-hand listings:', 'rental listings:',
                   'related schools:']
        with open('qingdao.csv', 'a', newline='') as f:
            writer = csv.DictWriter(f, headers)
            # writer.writeheader()
            for row in result:
                writer.writerow(row)

    def run(self):
        """
        Main routine: walk the listing pages, parse every detail page, save page by page.
        :return:
        """
        for i in range(1, 101):  # 272 pages in total
            url = self.url.format(i)
            link = self.get_link(url)
            list_result = []
            for j in link:
                try:
                    result = self.parse_message(j[0], j[1])
                    # random 1-3 second delay to avoid triggering the slider verification
                    time.sleep(random.randint(1, 3))
                    list_result.append(result)
                except Exception as err:
                    print(err)
            self.save_csv(list_result)
            print("Page %s saved successfully" % i)
        # url = 'https://qd.anjuke.com/community/view/875393?from=Filter_1&hfilter=filterlist'
        # self.parse_message(url)
        # self.get_link()


if __name__ == '__main__':
    spider = AnjukeSpider()
    spider.run()
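
As mentioned above, a follow-up improvement would be to take the city name as input instead of hard-coding Qingdao. Below is a minimal sketch building on the AnjukeSpider class above, assuming the listing URL pattern 'https://<city>.anjuke.com/community/p{}/' also holds for other city abbreviations; the subclass and its city parameter are hypothetical and not part of the original code, and the referer header, cookie and CSV filename would need adjusting for a different city as well.

class CityAnjukeSpider(AnjukeSpider):
    """Hypothetical extension: accept a city abbreviation instead of hard-coding 'qd'."""

    def __init__(self, city='qd'):
        super().__init__()
        # Doubled braces keep a literal {} as the page-number placeholder.
        self.url = 'https://{}.anjuke.com/community/p{{}}/'.format(city)


spider = CityAnjukeSpider('qd')  # 'qd' reproduces the original Qingdao run
spider.run()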

Source: www.cnblogs.com/lattesea/p/11746484.html