Python3 web crawler data collection (4~6)

1. Download the specified file from the webpage

  • The urlretrieve() method directly downloads remote data to the local.
  • urlretrieve(url, filename=None, reporthook=None, data=None)
  • url-specifies where to download
  • filename — specifies the local storage path (if this parameter is omitted, urllib generates a temporary file to save the data).
  • reporthook-is a callback function, which will be triggered when the server is connected and the corresponding data block transmission is completed. We can use this callback function to display the current download progress.
  • data-refers to the data posted to the server. This method returns a (filename, headers) tuple containing two elements. Filename represents the path to the local storage, and header represents the response header of the server.
from urllib.request import urlopen, urlretrieve  # urlretrieve was missing in the original
from bs4 import BeautifulSoup

# Fetch the front page and parse it.
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
# Find the image URL: the <img> nested inside the <a id="logo"> anchor.
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
# Download the image and save it as logo.jpg
urlretrieve(imageLocation, "logo.jpg")

2. Download the file with the specified src tag

import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Local directory that downloaded files are written into.
downloadDirectory = "downloaded/"
# Site root used to resolve relative links and to filter out external links.
baseUrl = "http://pythonscraping.com"

# 对URL链接进行清理和标准化,获得文件的绝对路径(而且去掉了外链)
# Clean and normalize a URL into an absolute path under baseUrl
# (external links are filtered out).
def getAbsoluteURL(baseUrl, source):
    """Normalize *source* into an absolute URL rooted at *baseUrl*.

    Strips a leading "www.", resolves relative paths against baseUrl,
    and returns None for external links (URLs not containing baseUrl).
    """
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("https://www."):
        # The original only handled http; https URLs fell through to the
        # relative-path branch and were mangled.
        url = "https://" + source[12:]
    elif source.startswith(("http://", "https://")):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        # Relative path: join with baseUrl, avoiding a double slash when
        # source already starts with "/".
        url = baseUrl + "/" + source.lstrip("/")
    # Drop external links.
    if baseUrl not in url:
        return None
    return url

# 去除目录中的特殊符号
# Remove characters that are illegal in file and directory names.
def correct_title(title):
    """Return *title* with filesystem-unsafe characters stripped out."""
    forbidden = '/\\:*?"|<>'
    # One C-level pass removes every occurrence of each forbidden character.
    return title.translate(str.maketrans("", "", forbidden))

# 获得下载目录
# Compute the local download path for a URL, creating directories as needed.
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a path under *downloadDirectory* and ensure
    the containing directory exists.

    NOTE(review): correct_title also strips '/', so nested URL paths
    collapse into a single flat filename under downloadDirectory.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = correct_title(path)
    path = downloadDirectory + path
    # Directory that must exist before the file can be written.
    directory = os.path.dirname(path)
    # exist_ok avoids the race between an existence check and creation.
    os.makedirs(directory, exist_ok=True)
    return path

html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
# Select every tag on the front page that carries a src attribute.
downloadList = bsObj.findAll(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        try:
            urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
        except Exception as e:
            # One failed download should not abort the whole crawl.
            # Exception (not BaseException) so Ctrl-C still stops the script.
            print(str(e))

Three, save the data of the web page to CSV

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first "wikitable" on the page.
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")
# 'with' guarantees the file is closed even if writing fails
# (replaces the manual try/finally close).
with open("../files/editors.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per table row, taking both header and data cells.
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)

Four, random walk

import  matplotlib.pyplot as plt
from random import choice

class RandomWalk():
    """Generate the points of a two-dimensional random walk."""

    def __init__(self, num_points=5000):
        """Store the walk length; every walk starts at the origin (0, 0)."""
        self.num_points = num_points
        self.x_values = [0]
        self.y_values = [0]

    def fill_walk(self):
        """Append random steps until the walk contains num_points points."""
        while len(self.x_values) < self.num_points:
            # Each axis gets a direction (1 = right/up, -1 = left/down)
            # and a distance of 0-4; a zero distance moves along one axis only.
            x_step = choice([1, -1]) * choice([0, 1, 2, 3, 4])
            y_step = choice([1, -1]) * choice([0, 1, 2, 3, 4])

            # Reject a step that goes nowhere on both axes.
            if x_step == 0 and y_step == 0:
                continue

            # Extend the walk from its current end point.
            self.x_values.append(self.x_values[-1] + x_step)
            self.y_values.append(self.y_values[-1] + y_step)
# 只要程序处于活动状态,就不断地模拟随机漫步

# Create a RandomWalk instance and plot all of its points.
rw = RandomWalk()
rw.fill_walk()
# figsize is a (width, height) tuple in inches for the plot window.
plt.figure(figsize=(10, 6))
# Color points by their order in the walk so the path direction is visible.
point_numbers = list(range(rw.num_points))
plt.scatter(rw.x_values, rw.y_values, c=point_numbers, cmap=plt.cm.Blues,
            edgecolors='none', s=1)
# Highlight the start (green) and end (red) of the walk.
plt.scatter(0, 0, c='green', edgecolors='none', s=100)
plt.scatter(rw.x_values[-1], rw.y_values[-1], c='red', edgecolors='none',
            s=100)
# Hide the axes. plt.axes() creates a NEW axes on modern matplotlib
# (and was called twice here); plt.gca() returns the current axes instead.
ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()

Five, calling the GitHub API to view the highest-starred Python repositories

import requests

# Make the API call and store the response.
url = 'https://api.github.com/search/repositories?q=language:python&sort=stars'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, timeout=10)
print("Status code:", r.status_code)
# Fail early with a clear HTTP error instead of crashing on malformed JSON.
r.raise_for_status()
# Store the API response in a variable.
response_dict = r.json()
print("Total repositories:", response_dict['total_count'])
# Explore information about the repositories.
repo_dicts = response_dict['items']
print("Repositories returned:", len(repo_dicts))

print("\nSelected information about each repository:")
for repo_dict in repo_dicts:
    print('\nName:', repo_dict['name'])
    print('Owner:', repo_dict['owner']['login'])
    print('Stars:', repo_dict['stargazers_count'])
    print('Repository:', repo_dict['html_url'])
    print('Description:', repo_dict['description'])

Guess you like

Origin blog.csdn.net/weixin_44485744/article/details/109314970