Python technology sharing: crawlers

Web data acquisition

Using the urllib module

How to fetch web page data with Python

Handle transcoding (converting the fetched bytes to text)

Prepare the web page material

Start httpd

Checking the Apache access log reveals that the requests identify themselves as coming from Python (urllib's default User-Agent)

The fix is to add header information to the urllib request:

import urllib.request as u
request = u.Request("http://192.168.86.11") # wrap the page address in a Request instance
request.add_header("User-Agent","Mozilla/5.0 \
(Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0") # add header information to the request
response = u.urlopen(request) # open the request (page address plus headers)
html = response.read()
print(html)  # show the page content
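
Note that read() returns the raw bytes of the page; to view it as text, decode it first (a small sketch, assuming the page is UTF-8 encoded):

text = html.decode("utf-8") # bytes -> str; adjust the encoding if the page declares another charset
print(text)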

Verify the Apache log on Linux

Open /var/log/httpd/access_log with vim and check whether the request records still contain Python information.
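
The same check can be done from Python (a quick sketch, assuming the default log path and read permission on it):

with open("/var/log/httpd/access_log") as f:
    for line in f:
        if "Python" in line: # urllib's default User-Agent contains "Python-urllib"
            print(line.rstrip())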

A program to download an image

import urllib.request as u
request = u.Request("http://192.168.86.11/style/\
u24020836931378817798fm170s6BA8218A7B2128178FA0A49F010080E2w.jpg") # image address
request.add_header("User-Agent","Mozilla/5.0 \
(Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0")
response = u.urlopen(request)
html = response.read() # the binary data of the image
#print(html)
with open("c:\\users\\allen\\desktop\\爬虫.jpg","wb") as f: # "爬虫" means "crawler"
    f.write(html)

Next, refactor the page-fetching program into functions

import urllib.request as u

url = "http://192.168.86.11"

def get_html(urladdr):
    "Fetch all of the source code of the page."
    request = u.Request(urladdr)
    request.add_header("User-Agent","Mozilla/5.0 \
    (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0")
    response = u.urlopen(request)
    html = response.read()
    return html

def get_imglist():
    "Collect the addresses of all images into one big list."
    pass

def get_imgs():
    "Download and save every image in the image list."
    pass

html = get_html(url)
print(html)

How to match strings with regular expressions

Single character match

"." matches any single character

>>> import re
>>> re.findall(".ood","I say Good not food")
['Good', 'food']
>>> re.findall(".ood","I say Good not food @ood")
['Good', 'food', '@ood']
>>> re.findall(".ood","I say Good not food  ood")
['Good', 'food', ' ood']
>>> re.findall(".ood","I say Good not food \nood")
['Good', 'food']
>>>
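
As the last call shows, "." does not match "\n" by default; passing the re.DOTALL flag changes that (a quick check in the same session):

>>> re.findall(".ood","I say Good not food \nood",re.DOTALL)
['Good', 'food', '\nood']
>>>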

[] matches any single character from the set

>>> re.findall("[fn]ood","I say Good not food nood") # ood preceded by f or n
['food', 'nood']
>>> re.findall("[^fn]ood","I say Good not food nood") # negated: ood NOT preceded by f or n
['Good']
>>> re.findall("^[Gfn]ood","Good not food nood") # outside [], ^ anchors to the start of the string
['Good']
>>> re.findall("^[Gfn]ood","I say Good not food nood")
[]
>>>
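
Ranges also work inside the brackets, e.g. [A-Z] for any uppercase letter:

>>> re.findall("[A-Z]ood","I say Good not food nood")
['Good']
>>>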

\d matches a single digit (0-9)

>>> re.findall("\d","How old are you? I am 36")
['3', '6']
>>> re.findall("\d\d","How old are you? I am 36")
['36']
>>>

\w matches a single word character: 0-9, a-z, A-Z, or _

>>> re.findall("\w","How old are you? I am 36")
['H', 'o', 'w', 'o', 'l', 'd', 'a', 'r', 'e', 'y', 'o', 'u', 'I', 'a', 'm', '3', '6']
>>> re.findall("\w\w\w","How old are you? I am 36")
['How', 'old', 'are', 'you']
>>> re.findall("\w\w","How old are you? I_am 36")
['Ho', 'ol', 'ar', 'yo', 'I_', 'am', '36']
>>>

\s matches any whitespace character (space, tab, carriage return, newline, ...)

>>> re.findall("\s","\tHow old are you?\r\n")
['\t', ' ', ' ', ' ', '\r', '\n']
>>>

Matching groups of characters

Match verbatim

>>> re.findall("allen","I am allen")
['allen']
>>> re.findall("allen","I am allenallen")
['allen', 'allen']
>>>

| separates alternative patterns: match either side (alternation)

>>> re.findall("food|nood","I say Good not food nood")
['food', 'nood']
>>> re.findall("not|nood","I say Good not food nood")
['not', 'nood']
>>>

* means the character to its left appears 0 or more times

>>> re.findall("go*gle","I like google not ggle goooogle and gogle")
['google', 'ggle', 'goooogle', 'gogle']
>>>

+ means the character to its left appears 1 or more times

>>> re.findall("go+gle","I like google not ggle goooogle and gogle")
['google', 'goooogle', 'gogle']
>>>

? means the character to its left appears 0 or 1 times

>>> re.findall("go?gle","I like google not ggle goooogle and gogle")
['ggle', 'gogle']

{} specifies how many times the character to its left appears

>>> re.findall("go{2}gle","I like google not ggle goooogle and gogle")
['google']
>>> re.findall("go{1}gle","I like google not ggle goooogle and gogle")
['gogle']
>>> re.findall("go{1,4}gle","I like google not ggle goooogle and gogle")
['google', 'goooogle', 'gogle']
>>>
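
The upper bound may be omitted: {m,} means m or more occurrences:

>>> re.findall("go{3,}gle","I like google not ggle goooogle and gogle")
['goooogle']
>>>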

Using the pieces above, complete the image scraper for the web test page

import urllib.request as u
import re

url = "http://192.168.86.11/" # note the trailing slash at the end

def get_html(urladdr):
    "Fetch all of the source code of the page."
    request = u.Request(urladdr)
    request.add_header("User-Agent","Mozilla/5.0 \
    (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0")
    response = u.urlopen(request)
    html = response.read()
    return html

def get_imglist(url,html):
    "Collect the addresses of all images into one big list."
    imglist = [] # container list for the image addresses
    bytsimglist = re.findall(rb"style/\w{60}\.jpg",html) # raw bytes pattern; dot escaped
    for i in bytsimglist: # the addresses are incomplete byte strings, so join and convert them
        imgaddr = url+str(i,encoding='utf8') # prepend the base URL and convert to str
        imglist.append(imgaddr) # store the full address in the list
    return imglist

def get_imgs(imglist):
    "Download and save every image in the image list."
    num = 0 # counter used to number the image files
    for imgurl in imglist:
        num += 1
        data = get_html(imgurl)
        with open("%s.jpg" %num,"wb") as f: # file names run from 1.jpg up to 54.jpg
            f.write(data)

html = get_html(url)
#print(html)
imglist = get_imglist(url,html)
#print(len(imglist))
get_imgs(imglist)
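
On an unreliable network a single dead link aborts the whole run. A defensive variant of get_imgs (a sketch reusing the get_html defined above) skips failed downloads instead:

def get_imgs_safe(imglist):
    "Download every image in the list, skipping addresses that fail."
    for num, imgurl in enumerate(imglist, start=1):
        try:
            data = get_html(imgurl)
        except Exception as e: # e.g. urllib.error.URLError for a dead link
            print("skipping %s: %s" % (imgurl, e))
            continue
        with open("%s.jpg" % num, "wb") as f:
            f.write(data)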

Crawling images from the Buka comic website

import urllib.request as u
import re

#url = "http://www.buka.cn/view/223172/65537.html"
#url = "http://www.buka.cn/view/223578/65537.html"
#url = "http://www.buka.cn/view/221784/65540.html"
url = "http://www.buka.cn/view/219792/65742.html"

def get_html(urladdr):
    "Fetch all of the source code of the page."
    request = u.Request(urladdr)
    request.add_header("User-Agent","Mozilla/5.0 \
    (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0")
    response = u.urlopen(request)
    html = response.read()
    return html

def get_imglist(url,html):
    "Collect the addresses of all images into one big list."
    imglist = [] # container list for the image addresses
    bytsimglist = re.findall(rb"http://i-cdn.ibuka.cn/pics/\d+/\d+/\w+\.jpg",html) # dot before jpg escaped
    #print(bytsimglist)
    for i in bytsimglist:
        imglist.append(str(i,encoding='utf8')) # convert the byte strings to str
    return imglist

def get_imgs(imglist):
    "Download and save every image in the image list."
    num = 0 # counter used to number the image files
    for imgurl in imglist:
        num += 1
        data = get_html(imgurl)
        with open("%s.jpg" %num,"wb") as f: # file names run from 1.jpg upward
            f.write(data)

html = get_html(url)
#print(html)
imglist = get_imglist(url,html)
#print(imglist)
get_imgs(imglist)

Special symbols in regular expression matching

^ matches at the start of a string; $ matches at the end

>>> re.findall('^I say',"I say Good not food")
['I say']
>>> re.findall('not food$',"I say Good not food")
['not food']
>>> re.findall('not Good$',"I say Good not food")
[]
>>>
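
With the re.MULTILINE flag, ^ and $ also match at the start and end of each line:

>>> re.findall("^I say","Good morning\nI say Good not food",re.MULTILINE)
['I say']
>>>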

\b marks a word boundary; the underscore counts as a word character, so allen_123 has no boundary after allen. Note that in an ordinary string \b is the backspace escape, so it must be written as \\b

>>> re.findall("allen","allen.com allen_123 allen.com")
['allen', 'allen', 'allen']
>>> re.findall("\ballen\b","allen.com allen_123 allen.com")
[]
>>> re.findall("\\ballen\\b","allen.com allen_123 allen.com")
['allen', 'allen']
>>>
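
The idiomatic way to avoid the doubled backslashes is a raw string:

>>> re.findall(r"\ballen\b","allen.com allen_123 allen.com")
['allen', 'allen']
>>>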
