Python crawler: how to scrape web page data and save it to a text file

No more talk, straight to the code:

The entry-point file, main.py:

import OrphaNet

if __name__ == '__main__':
    print('+++++starting')

    myCrawlOrphaNet = OrphaNet.CrawlOrphaNet()
    myCrawlOrphaNet.test()

The crawler file, OrphaNet.py:

Note: the initializer and the download method need no changes at all, but the folder path does need to be changed (it is flagged in the code comments), and that folder must contain a links.txt file.

The flow is: read links.txt line by line, open the website for each line (the base URL, https://www.orpha.net/consor/cgi-bin/Disease_Search_List.php?lng=EN&TAG=, only becomes a complete query once a value from links.txt is appended as the TAG parameter), then write the data extracted from each page into a newly created python.txt in the same folder.

links.txt can hold the values 0, A, B, C, D ... Z (note that each letter or digit must be on its own line).
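For instance, a minimal links.txt might look like this (these particular values are just an example; any of the values above work, one per line):

0
A
B
C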

import urllib.request
import urllib.error
import re
import codecs

class CrawlOrphaNet():
    # initialization
    def __init__(self):
        super(CrawlOrphaNet, self).__init__()
        print('CrawlOrphaNet init')
    # Given a page URL, return the page's HTML (or None if the download fails)
    def download(self, url, numRetries=2):
        print('download url:' + url)
        try:
            html = urllib.request.urlopen(url).read()
            print('download done')
            print(len(html))
        except urllib.error.URLError as e:
            print('download Error:' + str(e))
            html = None
            if numRetries > 0:
                return self.download(url, numRetries - 1)

        return html
    # Given one parameter, open the search page for it and extract the data; the output path is set here:
    def readDisease(self, lineNum):
        print('readDisease lineNum: ' + str(lineNum))
        # This base URL can be used as-is, but note that a TAG value must be appended to it
        urlLinkTEST = 'https://www.orpha.net/consor/cgi-bin/Disease_Search_List.php?lng=EN&TAG='
        html = self.download(urlLinkTEST + lineNum)
        if html is None:  # give up on this parameter if the download failed even after retries
            return
        folderPath = 'E:/PYTHONWorkSpace/crawl_web/www.orpha.net/data/'  # note: change this path to your own
        outputFilePath = folderPath + 'python.txt'
        print('readDisease write start')
        html = html.decode('ISO-8859-1')
        # Capture the Expert ID from every disease link on the results page
        listhtml = re.findall(r"<a href='OC_Exp\.php\?lng=EN&Expert=(.*?)'>", html)
        for i in listhtml:
            print("here---------------------  : %s" % (i))
        try:
            file_object = open(outputFilePath, 'a')  # 'w' overwrites, 'a' appends
            for i in listhtml:
                file_object.write(i + '\n')
            print('readDisease write done')
        finally:
            file_object.close()


    # Read the txt file line by line and crawl with each line's value as the parameter
    def parseLinkFile(self, inputFilePath):
        print('parseLinkFile inputFilePath:' + inputFilePath)
        file_object = codecs.open(inputFilePath, encoding='UTF-8', errors='ignore')
        try:
            print('parseLinkFile read start')
            while 1:
                line = file_object.readline()
                if not line:  # stop at end of file
                    break
                print('line: ' + str(line.strip()))
                self.readDisease(str(line.strip()))
            print('parseLinkFile read done')
        finally:
            file_object.close()

    def test(self):
        print('test')
        folderPath = 'E:/PYTHONWorkSpace/crawl_web/www.orpha.net/data/'  # note: change this path to your own
        inputFile = 'links.txt'  # the txt file that must exist in that folder
        self.parseLinkFile(folderPath + inputFile)
Run it and give it a try.
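To sanity-check the extraction step on its own, you can run the same regular expression against a sample anchor tag. The snippet below is a minimal sketch; the Expert ID 558 and the link text are made up for illustration, only the link format follows what readDisease expects:

import re

sampleHtml = "<a href='OC_Exp.php?lng=EN&Expert=558'>some disease name</a>"
pattern = r"<a href='OC_Exp\.php\?lng=EN&Expert=(.*?)'>"
print(re.findall(pattern, sampleHtml))  # prints ['558']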

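If the site responds slowly or rejects the default Python client, a slightly hardened standalone variant of download is sketched below. The timeout value and the User-Agent header are assumptions, not part of the original code:

import urllib.request
import urllib.error

def download(url, numRetries=2):
    # Some servers reject Python's default User-Agent, so send a browser-like one (assumption)
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urllib.request.urlopen(request, timeout=10).read()
    except urllib.error.URLError as e:
        print('download Error:' + str(e))
        if numRetries > 0:
            return download(url, numRetries - 1)
        return None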

Reposted from blog.csdn.net/qq_36187544/article/details/80320617