python爬虫技术细节合集

1文件处理pathlib

1.1文本文件读写

from pathlib import Path

 

if __name__ == '__main__':
    # Demo of basic pathlib usage: list directories, then create,
    # write, inspect and delete a small text file.
    p = Path('..')
    print(p)
    # All subdirectories of the parent directory.
    print([x for x in p.iterdir() if x.is_dir()])

    # Path object for a file that may or may not exist yet.
    q = Path('../data/tmp')
    if not q.exists():
        # not exist case
        print(q.name, 'is not exist!')

        # Make sure the parent directory exists, otherwise touch() raises
        # FileNotFoundError when ../data is missing.
        q.parent.mkdir(parents=True, exist_ok=True)

        # create empty file (size 0)
        q.touch()
        print(q.name, q.stat().st_size)

        # write_text() replaces the whole file content with the string.
        q.write_text('werwqer')
        print(q.name, q.stat().st_size)
    else:
        # exist case: show content (if any), then remove the file.
        print(q.name, 'is exist, delete it first!')
        if q.stat().st_size == 0:
            # Empty file: nothing to show, just delete it.
            q.unlink()
        else:
            print(q.read_text())
            q.unlink()

 

1.2二进制文件读写

from pathlib import Path

 

# Save a downloaded file under ./output/SourceCode/<project_name>/<file_name>.
# NOTE(review): `item` (a dict with 'project_name' / 'file_name' keys) and
# `remote_file` (presumably a requests.Response) are defined elsewhere — confirm.
current_path = Path.cwd()
print('current path ', current_path)
local_download_path = current_path / 'output/SourceCode' / item['project_name']
# local_download_path = Path(local_download)
# create the nested directory tree (no error if it already exists)
local_download_path.mkdir(parents=True, exist_ok=True)

local_file = item['file_name']
local_download_file = local_download_path / local_file
print('local download file: ', local_download_file)
# print('is file: ', local_download_file.is_file())
# write_bytes() writes raw binary content, suitable for downloaded files
local_download_file.write_bytes(remote_file.content)

1.3创建目录

# local_download_path = Path(local_download)
# create nested directories; parents=True builds intermediate levels,
# exist_ok=True suppresses the error when the path already exists
local_download_path.mkdir(parents=True, exist_ok=True)

 

2 url访问requests

2.1创建url请求

import requests

# Browser-like User-Agent so the server does not reject a plain script client.
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

url = 'https://www.apache.org/dist/ant/'

# Always pass a timeout: requests has no default and would otherwise
# block forever on a dead connection.
sourceHTML = requests.get(url, headers=headers, timeout=30)

 

3 html解析lxml

3.1 etree元素

from lxml import etree

# Parse the downloaded HTML and select every <a> element that has an
# href attribute inside the first <pre> element.
selector = etree.HTML(sourceHTML.text)
folder_list = selector.xpath('//pre[position()=1]/a[@href]')
for elmt in folder_list:
    href_TT = elmt.get('href')
    print('href_TT ', href_TT)
    # An href ending in '/' points at a sub-directory listing.
    # endswith() also handles an empty href safely (no IndexError).
    if href_TT.endswith('/'):
        print('folder_list', elmt.attrib)

 

# Three equivalent ways of reading attributes from matched elements.
for x in result:
    # Way 1: Element.get() fetches one attribute value by name.
    print('x: ', x.get('class'), x.get('id'), x.tag)
    # Way 2: Element.attrib behaves like a dict of all attributes.
    print('x: ', x.attrib['class'])
    # Way 3: copy .attrib into a plain dict; .items() yields every
    # (name, value) pair.
    print('x: ', dict(x.attrib).items())

3.2 elementTree元素

# parse() reads an XML file and returns an ElementTree, which carries
# document-level metadata (prolog, DOCTYPE) in addition to the elements.
# NOTE(review): pom_path is defined elsewhere; assumed to point at a pom.xml.
pom_tree = etree.parse(str(pom_path))
# docinfo.xml_version is the XML version declared in the document prolog
print('xml_version', pom_tree.docinfo.xml_version)
# ElementTree.getroot() returns the root Element of the tree
root = pom_tree.getroot()
# tostring() serializes an etree element back to bytes for display
print(etree.tostring(root,pretty_print=True))

4 XML解析

4.1 Xpath使用

1

//pre[position()=1]/a[@href]  xml中第一个pre元素下的包含href属性的a元素【列表】

2

//*[@id="content"]/div[@class="navigator-container "]/'

'div[@class="navigator-body"]/'

'div[@class="contained-content"]/'

'div[@class="navigator-group"]/'

'div[@class="results-panel navigator-item"]/'

'div/@data-issue-table-model-state'

Xml中id属性为content的所有元素中class属性为navigator-container (注意有空格)的div元素中的class属性为navigator-body的div元素中的class属性为contained-content的div元素中的class属性为navigator-group的div元素中的class属性为results-panel navigator-item的div元素中的div元素中的data-issue-table-model-state属性值

# Namespaced XPath: the POM's default xmlns must be bound to a prefix
# ('ns' here) via the namespaces dict and used on every tag in the query.
group_Id = plugins[i].xpath('ns:groupId',
                       namespaces={'ns': 'http://maven.apache.org/POM/4.0.0'})

xml文件带有xmlns,xpath解析需要带namespaces字典参数,定义xmlns值为字典值,再在xpath正则表达式的每个tag值前加上这个tag值。若不知道带的xmlns是什么,可以用for x in etree元素查找它的类型。

注:xpath语法很细,主要关注的地方是获取元素,获取属性,对于细化的如属性值包含字段,后缀规律,第几个元素等,建议python编码二次处理。

lxml文档:https://lxml.de/index.html

Xpath文档:http://www.w3school.com.cn/xpath/index.asp

5 文本格式化csv

4.1字典写

import csv

 

# Write two records (plus a header row) to names.csv using csv.DictWriter.
with open('names.csv', 'w', newline='') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # writerows() accepts any iterable of row dicts.
    writer.writerows([
        {'first_name': 'Baked', 'last_name': 'Beans'},
        {'first_name': 'Lovely', 'last_name': 'Spam'},
    ])

4.2字典读

# 读csv文件

# Read names.csv back; DictReader yields one dict per data row.
with open('names.csv', 'r', newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Join the row's VALUES. Iterating a dict directly would only
        # yield its keys, i.e. print the header names for every row.
        print('| '.join(row.values()))

6 文本格式化json

6.1 json读

从字典序列化成json格式(json.dumps),明显的标记是‘变成了“

import json

# Sample record mixing scalars, a comma-containing string, and a nested
# list of dicts.
json_data = {
    'name': 'angle',
    'age': 30,
    'sex': 'F',
    'favorite': 'food, travel, climb',
    'books': [{'bookName': 'bigDad1'}, {'bookName': 'bigDad2'}],
}

# json.dumps() serializes a Python object into a JSON-formatted str
# (single quotes become double quotes in the output).
print('json dumps:', json.dumps(json_data))
# indent=4 pretty-prints the same document.
print('json dumps:', json.dumps(json_data, indent=4))

6.2 json写

从json字符串解析回字典(json.loads),明显的标记是“变成了‘

import json

# json.loads() deserializes a JSON string into Python objects;
# JSON true/false become Python True/False.
json_format = json.loads('{"__complex__": true, "real": 1, "image": 2}')
# Walk the resulting dict, printing each key with its value.
for key, value in json_format.items():
    print(key, value)

7 ubuntu命令

7.1 无人值守批跑

nohup python -u xxx.py > xxx.log & 支持关终端后台运行写日志

 

猜你喜欢

转载自blog.csdn.net/xiexie1357/article/details/82699873