Practical project 2: implementing a web crawler with PyCharm, Python, and MS SQL Server.
Goal: use Python to crawl all released version numbers of the USB-drive boot tool from the Laomaotao website (https://www.laomaotao.net/) and write them into an MS SQL Server database.
Update later...
First, click Terminal in the lower-left corner of PyCharm and run `pip install requests` and `pip install BeautifulSoup4` to install the required packages.
1. Warm-up: write a first crawler that fetches the Baidu website.
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.baidu.com')  # request the Baidu home page
print(resp)  # print the status code of the request result
# print(resp.content)  # uncomment to print the raw page source

# Wrap the page source in a BeautifulSoup object for easy traversal.
bsobj = BeautifulSoup(resp.content, 'lxml')
a_list = bsobj.find_all('a')  # every <a> tag object in the page

# Fix: some <a> tags carry no href attribute, so a.get('href') returns
# None and the original `href + '\n'` raised TypeError — skip those.
# ''.join is also linear, unlike repeated string concatenation.
hrefs = [a.get('href') for a in a_list if a.get('href')]
text = '\n'.join(hrefs) + '\n' if hrefs else ''

# Open url.txt for writing in the current directory (created if absent).
# An explicit encoding avoids platform-dependent defaults.
with open('url.txt', 'w', encoding='utf-8') as f:
    f.write(text)  # dump the collected links
2. Crawling the Laomaotao website
The Laomaotao website has anti-crawling protection, so I switched to another example shared by a netizen and added the part that writes to the database.
The website we want to crawl: http://www.weather.com.cn/weather/101190401.shtml
1. First create a new table t_tq in MS SQL Server.
2. In the PyCharm IDE select New Project -> Pure Python, create a new Python file testtq.py in the project, and paste in the following code.
import requests
import csv
import random
import time
import socket
import http.client
# import urllib.request
from bs4 import BeautifulSoup
import pymssql
def insert(results):
    """Insert one day's weather record into the t_tq table.

    Parameters
    ----------
    results : dict
        Expected keys: 'rq' (date), 'tq' (weather description),
        'zgwd' (highest temperature), 'zdwd' (lowest temperature).

    Returns
    -------
    dict
        The same ``results`` dict, unchanged, for caller convenience.

    Raises
    ------
    Exception
        Re-raises whatever the driver raised after rolling back.
    """
    # Open the database connection.
    db = pymssql.connect(host='127.0.0.1', user='sa', password='3201319',
                         database='bhjs', port=1433)
    try:
        # Get an operation cursor with cursor().
        cur = db.cursor()
        # Parameterized statement: pymssql substitutes the values safely.
        sql = "INSERT INTO t_tq (rq,tq,zgwd,zdwd) VALUES (%s, %s, %s, %s)"
        try:
            cur.execute(sql, (results['rq'], results['tq'],
                              results['zgwd'], results['zdwd']))
            db.commit()
        except Exception:
            db.rollback()  # leave the table consistent on failure
            raise          # bare raise preserves the original traceback
    finally:
        # Fix: the original never closed the connection, leaking one
        # connection per inserted row.
        db.close()
    return results
def get_content(url, data=None):
    """Download *url* and return the response body as text.

    Retries forever on transient network errors, sleeping a random
    interval between attempts (same policy as the original code).

    Parameters
    ----------
    url : str
        The page to download.
    data : unused
        Kept only for backward compatibility with existing callers.

    Returns
    -------
    str
        The response body decoded as UTF-8.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    # Random timeout (seconds) so repeated runs do not look identical.
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            rep.encoding = 'utf-8'  # the site serves UTF-8 content
            break
        # Fix: requests wraps low-level socket failures in its own
        # exception hierarchy, so the original socket.*/http.client.*
        # handlers never fired and Timeout/ConnectionError escaped the
        # retry loop. Catch RequestException first.
        except requests.exceptions.RequestException as e:
            print('1:', e)
            time.sleep(random.choice(range(8, 15)))
        except socket.timeout as e:
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            print('4:', e)
            time.sleep(random.choice(range(20, 60)))
        except http.client.BadStatusLine as e:
            print('5:', e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print('6:', e)
            time.sleep(random.choice(range(5, 15)))
    return rep.text
def get_data(html_text):
    """Parse the 7-day forecast page.

    Returns a list of ``[date, weather, high, low]`` rows; each parsed
    day is also written to the database immediately via ``insert()``.
    """
    record = {'rq': 50, 'tq': 50, 'zgwd': 50, 'zdwd': 50}
    rows = []
    soup = BeautifulSoup(html_text, "html.parser")
    # The forecast lives in <div id="7d"> inside the page body;
    # each day is one <li> of its <ul>.
    seven_day = soup.body.find('div', {'id': '7d'})
    for day in seven_day.find('ul').find_all('li'):
        date = day.find('h1').string        # the day's date heading
        paragraphs = day.find_all('p')      # weather / temperature blocks
        weather = paragraphs[0].string      # weather description
        # In the evening the site drops the day's high temperature:
        # the <span> is missing, so record None for the high.
        span = paragraphs[1].find('span')
        if span is None:
            high = None
        else:
            # Late in the day the site appends a ℃ to the high as well.
            high = span.string.replace('℃', '')
        # The low always carries a trailing ℃ — strip it.
        low = paragraphs[1].find('i').string.replace('℃', '')
        record['rq'] = date
        record['tq'] = weather
        record['zgwd'] = high
        record['zdwd'] = low
        rows.append([date, weather, high, low])
        insert(record)  # persist this day right away
    return rows
def write_data(data, name):
    """Append the given rows to the CSV file *name*.

    Encoding errors are ignored so odd characters never abort the dump;
    newline='' lets the csv module control line endings itself.
    """
    with open(name, 'a', errors='ignore', newline='') as out:
        csv.writer(out).writerows(data)
if __name__ == '__main__':
    # Target page: 7-day weather forecast (city code 101190401).
    url = 'http://www.weather.com.cn/weather/101190401.shtml'
    html = get_content(url)   # download the page source
    result = get_data(html)   # parse it; each day is inserted into SQL Server
    # write_data(result, 'weather.csv')  # optionally also dump rows to CSV
3. Run the script; you will see the following results in SQL Server.
Use the best tools and fight steady, methodical battles. — Zeng Guofan
https://www.jetbrains.com/zh-cn/