08 Getting started with crawlers: use requests and beautifulsoup4 to crawl Wikipedia entry links and store them in a database
Python: crawl Wikipedia entry links with requests and beautifulsoup4 and save them to a database
Reference documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
# Install beautifulsoup4
(pytools) D:\python\pytools>pip install beautifulsoup4
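A quick way to confirm the install worked (my own snippet, not from the original post) is to parse a tiny HTML fragment:

# check that beautifulsoup4 imports and parses correctly
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello, bs4</p>", "html.parser")
print(soup.p.get_text())  # prints: hello, bs4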
# Install the pymysql module
pymysql repository: https://github.com/PyMySQL/PyMySQL
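Install it the same way as beautifulsoup4 (this command mirrors the environment-prep comment in the script further down):

(pytools) D:\python\pytools>pip install pymysql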
# Crawl Wikipedia entries
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re


def spider_wike():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # decode the response as utf-8
    resp.encoding = 'utf-8'
    html_doc = resp.text
    soup = BeautifulSoup(html_doc, "html.parser")
    # find all <a> tags whose href attribute starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)
    # print the name and URL of every entry
    for url in list_urls:
        # filter out URLs that end in .jpg or .JPG
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            # .string only returns a single child; get_text() returns all of the tag's text
            print(url.get_text(), "<------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wike()
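The Main Page repeats many links, so the same entry may be printed several times. A minimal sketch of one way to deduplicate (the function name collect_unique_entries and the dict-based approach are my own, not part of the original post; the page and filtering logic are the same as above):

# collect unique entry links into a dict keyed by href, so duplicates collapse
from bs4 import BeautifulSoup
import requests
import re


def collect_unique_entries():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    entries = {}
    for a in soup.find_all("a", href=re.compile("^/wiki/")):
        if not re.search(r"\.(jpg|JPG)$", a["href"]):
            # keyed by href, so a link that appears twice is stored once
            entries[a["href"]] = a.get_text()
    for href, name in entries.items():
        print(name, "<------>", "https://en.wikipedia.org" + href)


if __name__ == '__main__':
    collect_unique_entries()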
# Store the Wikipedia entry links in the database
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors

'''
# Environment preparation
pip install pymysql
create database wikiurl charset=utf8mb4;
use wikiurl;
create table urls (id int primary key auto_increment, urlname varchar(255), urlhref varchar(1000));
'''

url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# decode the response as utf-8
resp.encoding = 'utf-8'
html_doc = resp.text
soup = BeautifulSoup(html_doc, "html.parser")
# find all <a> tags whose href attribute starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)
# print the name and URL of every entry
for url in list_urls:
    # filter out URLs that end in .jpg or .JPG
    if not re.search(r"\.(jpg|JPG)$", url["href"]):
        # .string only returns a single child; get_text() returns all of the tag's text
        print(url.get_text(), "<------>", "https://en.wikipedia.org" + url["href"])

        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            # get a cursor
            with connection.cursor() as cursor:
                # build the sql statement
                sql = "insert into `urls` (`urlname`, `urlhref`) values (%s, %s)"
                # execute the sql statement
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                # commit the data
                connection.commit()
        finally:
            connection.close()
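The script above opens and closes a connection for every single row. As a possible refinement (my own sketch, not the original code; it assumes the same wikiurl database and urls table created above), you can collect the links first and insert them in one batch with executemany on a single connection:

# insert all crawled links in one batch over a single connection
import pymysql


def save_links(links):
    # links: a list of (urlname, urlhref) tuples collected by the crawler
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            sql = "insert into `urls` (`urlname`, `urlhref`) values (%s, %s)"
            # one batched statement instead of one connection per row
            cursor.executemany(sql, links)
        connection.commit()
    finally:
        connection.close()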
# Read the entry information from the database
# coding=utf-8
import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()
    sql = "select `urlname`, `urlhref` from urls"
    cur = conn.cursor()
    # get the total number of records
    count = cur.execute(sql)
    print(count)

    # fetch all rows
    # urllists = cur.fetchall()
    # fetch a specified number of rows
    # urllists = cur.fetchmany(3)
    # for url in urllists:
    #     print(url[0], '<--->', url[1])

    # fetch a single row
    link = cur.fetchone()
    print(link)

    # close the connection
    conn.close()


def get_data():
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            sql = "select `urlname`, `urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # fetch all rows
            # data = cur.fetchall()
            # print(data)

            # fetch a specified number of rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()
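Both readers above return rows as positional tuples. A small sketch of an alternative (assuming the same connection settings; the function name get_data_as_dicts is my own): pass cursorclass=pymysql.cursors.DictCursor so each row comes back as a dict keyed by column name, which makes the printing code easier to read.

# read rows as dicts keyed by column name using DictCursor
import pymysql
import pymysql.cursors


def get_data_as_dicts():
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='root',
                           db='wikiurl',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cur:
            cur.execute("select `urlname`, `urlhref` from urls")
            for row in cur.fetchmany(size=5):
                print(row['urlname'], '<--->', row['urlhref'])
    finally:
        conn.close()


if __name__ == '__main__':
    get_data_as_dicts()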