Python crawler: how to scrape web page data and save it to a text file

No more talk, straight to the code:

The entry-point file, main.py:

import OrphaNet

if __name__ == '__main__':
    print('+++++starting')

    myCrawlOrphaNet = OrphaNet.CrawlOrphaNet()
    myCrawlOrphaNet.test()

The crawler file, OrphaNet.py:

Note: the initializer and the download method need no changes at all, but the folder path does need to be changed (it is flagged in the code comments), and that folder must contain a links.txt file.

The flow is: read links.txt line by line, open the website for each line (the base URL, https://www.orpha.net/consor/cgi-bin/Disease_Search_List.php?lng=EN&TAG=, only becomes a complete query once a value from links.txt is appended as the TAG parameter), then write the data extracted from each page into a newly created python.txt in the same folder.

links.txt can hold the values 0, A, B, C, D ... Z (note that each letter or digit must be on its own line).
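For instance, a minimal links.txt might look like this (these particular values are just an example; any of the values above work, one per line):

0
A
B
C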

import urllib.request
import urllib.error
import re
import codecs

class CrawlOrphaNet():
    # initialization
    def __init__(self):
        super(CrawlOrphaNet, self).__init__()
        print('CrawlOrphaNet init')
    # Given a page URL, return the page's HTML (or None if the download fails)
    def download(self, url, numRetries=2):
        print('download url:' + url)
        try:
            html = urllib.request.urlopen(url).read()
            print('download done')
            print(len(html))
        except urllib.error.URLError as e:
            print('download Error:' + str(e))
            html = None
            if numRetries > 0:
                return self.download(url, numRetries - 1)

        return html
    # Given one parameter, open the search page for it and extract the data; the output path is set here:
    def readDisease(self, lineNum):
        print('readDisease lineNum: ' + str(lineNum))
        # This base URL can be used as-is, but note that a TAG value must be appended to it
        urlLinkTEST = 'https://www.orpha.net/consor/cgi-bin/Disease_Search_List.php?lng=EN&TAG='
        html = self.download(urlLinkTEST + lineNum)
        if html is None:  # give up on this parameter if the download failed even after retries
            return
        folderPath = 'E:/PYTHONWorkSpace/crawl_web/www.orpha.net/data/'  # note: change this path to your own
        outputFilePath = folderPath + 'python.txt'
        print('readDisease write start')
        html = html.decode('ISO-8859-1')
        # Capture the Expert ID from every disease link on the results page
        listhtml = re.findall(r"<a href='OC_Exp\.php\?lng=EN&Expert=(.*?)'>", html)
        for i in listhtml:
            print("here---------------------  : %s" % (i))
        try:
            file_object = open(outputFilePath, 'a')  # 'w' overwrites, 'a' appends
            for i in listhtml:
                file_object.write(i + '\n')
            print('readDisease write done')
        finally:
            file_object.close()


    # Read the txt file line by line and crawl with each line's value as the parameter
    def parseLinkFile(self, inputFilePath):
        print('parseLinkFile inputFilePath:' + inputFilePath)
        file_object = codecs.open(inputFilePath, encoding='UTF-8', errors='ignore')
        try:
            print('parseLinkFile read start')
            while 1:
                line = file_object.readline()
                if not line:  # stop at end of file
                    break
                print('line: ' + str(line.strip()))
                self.readDisease(str(line.strip()))
            print('parseLinkFile read done')
        finally:
            file_object.close()

    def test(self):
        print('test')
        folderPath = 'E:/PYTHONWorkSpace/crawl_web/www.orpha.net/data/'  # note: change this path to your own
        inputFile = 'links.txt'  # the txt file that must exist in that folder
        self.parseLinkFile(folderPath + inputFile)
Run it and give it a try.
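To sanity-check the extraction step on its own, you can run the same regular expression against a sample anchor tag. The snippet below is a minimal sketch; the Expert ID 558 and the link text are made up for illustration, only the link format follows what readDisease expects:

import re

sampleHtml = "<a href='OC_Exp.php?lng=EN&Expert=558'>some disease name</a>"
pattern = r"<a href='OC_Exp\.php\?lng=EN&Expert=(.*?)'>"
print(re.findall(pattern, sampleHtml))  # prints ['558']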

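If the site responds slowly or rejects the default Python client, a slightly hardened standalone variant of download is sketched below. The timeout value and the User-Agent header are assumptions, not part of the original code:

import urllib.request
import urllib.error

def download(url, numRetries=2):
    # Some servers reject Python's default User-Agent, so send a browser-like one (assumption)
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urllib.request.urlopen(request, timeout=10).read()
    except urllib.error.URLError as e:
        print('download Error:' + str(e))
        if numRetries > 0:
            return download(url, numRetries - 1)
        return None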

Reposted from blog.csdn.net/qq_36187544/article/details/80320617