A web crawler for scraping the airport website

Instead of estimating by hand in Excel, I wrote a quick web crawler — part of my fast-crawler series

Crawling taxi fares from the airport website
https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html

Tag object

  • tag.attrs
  • has_attr()

Child nodes: filtering out the bs4-wrapped string objects so that only Tag objects remain

  • using a list comprehension
  • the list pop() operation
  • the tag.children iterator vs. the tag.contents list

Using .get_text(), .strings and .stripped_strings

  • get_text() returns the text as one string and does not remove internal whitespace
  • .strings
  • stripped_strings is an iterator; convert it to a list with list()
import numpy
import requests
import bs4
# Request headers: send a browser-like User-Agent so the site serves the page.
headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}

# Download the taxi-fare page from the Shanghai airport site.
response = requests.get(
    "https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html",
    headers=headers,
)
text = response.text
# Parse with the lxml backend; the fare tables live inside <tbody> elements.
bs = bs4.BeautifulSoup(text, features='lxml')
nodes = bs.find_all('tbody')


def has_attr_class(tag):
    """Return whether *tag* carries a ``class`` attribute (used to spot header rows)."""
    result = tag.has_attr('class')
    return result

def parse_table(node):
    """Parse one <tbody> fare table into a list of rows.

    The <tr> that carries a ``class`` attribute is treated as the header
    row; every other <tr> is a data row.  Returns a list of lists of
    cell strings.
    """
    # Drop the whitespace-only text children: bs4's NavigableString is a
    # str subclass, so this keeps only real Tag children (the <tr> rows).
    # (The previous `child.string != ' '` test missed '\n' text nodes.)
    rows = [child for child in node.children if not isinstance(child, str)]
    array = []
    for tr in rows:
        if tr.has_attr('class'):
            # Header row: its cells are the column names.
            print("************出租车收费标准***************")
            print("表单字段名字为:")
            # stripped_strings yields each cell's text with surrounding
            # whitespace removed and blank strings skipped.
            field = list(tr.stripped_strings)
            print(field, '\n')
            array.append(field)
        else:
            value = list(tr.stripped_strings)
            # Rows with 5 cells start with a district/landmark label;
            # drop it so every data row keeps the same 4 fare columns.
            # (The label itself is not needed, so no assignment.)
            if len(value) == 5:
                value.pop(0)
            print(value)
            array.append(value)
    print('----------------------------------------')
    return array

#原表中有两个表
#浦东机场一号航站楼和二号航站楼
import numpy
import requests
import bs4
# Browser-like request headers so the server returns the normal HTML page.
headers = {
    'Accept': 'text/html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}

# Fetch the fare page, then parse it and collect every <tbody> (one per table).
url = "https://www.shanghaiairport.com/pdjc/jcjt/index_43742.html"
response = requests.get(url, headers=headers)
text = response.text
bs = bs4.BeautifulSoup(text, features='lxml')
nodes = bs.find_all('tbody')


def has_attr_class(tag):
    """True when *tag* defines a ``class`` attribute; header rows do, data rows don't."""
    return bool(tag.has_attr('class'))

def parse_table(node):
    """Parse one <tbody> fare table into a list of rows.

    The <tr> that carries a ``class`` attribute is the header row; all
    other <tr> elements are data rows.  Returns a list of lists of cell
    strings.
    """
    # Filter out the bs4-wrapped string children (NavigableString is a
    # str subclass), keeping only Tag children — i.e. the <tr> rows.
    # The previous `child.string != ' '` check missed '\n' text nodes.
    rows = [child for child in node.children if not isinstance(child, str)]
    array = []
    for tr in rows:
        if tr.has_attr('class'):
            # Header row: print and record the column names.
            print("************出租车收费标准***************")
            print("表单字段名字为:")
            # stripped_strings removes surrounding whitespace and skips
            # blank strings, one item per cell.
            field = list(tr.stripped_strings)
            print(field, '\n')
            array.append(field)
        else:
            value = list(tr.stripped_strings)
            # 5-cell rows begin with a district/landmark label; discard
            # it so every data row has the same 4 fare columns.
            if len(value) == 5:
                value.pop(0)
            print(value)
            array.append(value)
    print('----------------------------------------')
    return array

#原表中有两个表
#浦东机场一号航站楼和二号航站楼
# Write each parsed <tbody> table to its own numbered text file.
# enumerate replaces the former manual counter (the `global i` statement
# was a no-op at module scope).
for table_index, node in enumerate(nodes, start=1):
    array = parse_table(node)
    # NOTE(review): hard-coded user desktop path — consider making this
    # output directory configurable.
    numpy.savetxt(r"E:\360MoveData\Users\hzsdl\Desktop\%d.txt" % table_index,
                  array, delimiter=',', fmt='%s')

You may also like

Origin www.cnblogs.com/lzycodinglife/p/12584851.html