Python 之爬取 58 二手房实例(初学者,见谅)

import requests
from lxml import etree
def extract_title(text_nodes):
    """Return the listing title taken from the text nodes of an ``<h2>``.

    The title is the second text node (index 1) with every space character
    removed, which is what the listing markup targeted by this script uses.

    Args:
        text_nodes: Sequence of strings as returned by the XPath query
            ``./div[2]/h2//text()`` on one listing ``<li>`` element.

    Returns:
        The cleaned title string, or ``None`` when there are fewer than two
        text nodes (e.g. an ad or placeholder row without a regular title).
    """
    # Indexing [1] unconditionally would raise IndexError on irregular rows
    # and abort the whole run, so such rows are reported as "no title".
    if len(text_nodes) < 2:
        return None
    return str(text_nodes[1]).replace(' ', '')


def main():
    """Download the 58.com Beijing second-hand housing page and save titles.

    Fetches the listing page, extracts one title per listing and writes the
    titles, one per line, to ``58二手房信息.txt`` (UTF-8) in the current
    working directory, printing a progress line for each saved title.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://bj.58.com/ershoufang/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/536.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() stops us from parsing an error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    page_text = response.text

    # Parse the downloaded markup.
    #
    # Problem 1 (encountered while writing this script):
    #   tree = etree.parse(page_text, parser=etree.HTMLParser(encoding="utf-8"))
    #   failed with "OSError: Error reading file", because etree.parse()
    #   treats its first argument as a file/URL to read from, not as HTML
    #   text that has already been downloaded.
    # Solution:
    #   Feed the downloaded text to etree.HTML() first, then run xpath() on
    #   the element it returns.
    #
    # Problem 2 (selecting nodes in XPath):
    #   "."  selects the current node.
    #   ".." selects the parent of the current node.
    html = etree.HTML(page_text)
    # Guard in case the parser produced no document at all (empty response).
    if html is None:
        content_list = []
    else:
        content_list = html.xpath("//ul[@class='house-list-wrap']/li")

    # Collect the usable titles first so the progress counter below stays
    # consecutive even when irregular rows are skipped.
    titles = []
    for item in content_list:
        title = extract_title(item.xpath("./div[2]/h2//text()"))
        if title is not None:
            titles.append(title)

    # Store the titles in a file; the context manager guarantees the file is
    # flushed and closed even if a write fails part-way through.
    with open('58二手房信息.txt', 'w', encoding="utf-8") as fp:
        for index, title in enumerate(titles):
            fp.write(title + '\n')
            print(index, "存入成功")
    print("成功!")


if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/weixin_45729594/article/details/109056515