PyCharm + Python + MS SQL Server Hands-On 2: Implementing a Crawler Program


 

Goal: use Python to crawl the Laomaotao website https://www.laomaotao.net/ for all version numbers of its USB boot disk creation tool, and then write them into a MS SQL Server database.

To be updated later...
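
Until that update arrives, here is a minimal, untested sketch of what the first step might look like. Everything in it beyond the URL is an assumption: the browser-like User-Agent header and the version-number regular expression are placeholders, since the site's real page structure (and its anti-crawling behaviour) is not covered in this post.

import re
import requests

# Rough sketch only: fetch the Laomaotao homepage and pull out anything that
# looks like a version number. The regex and the assumption that version
# strings appear as plain text in the HTML are placeholders, not the site's
# actual layout.
def fetch_versions(url='https://www.laomaotao.net/'):
    headers = {'User-Agent': 'Mozilla/5.0'}  # browser-like UA, since the site has anti-crawling protection
    resp = requests.get(url, headers=headers, timeout=30)
    resp.encoding = 'utf-8'
    # Match strings such as 'V9.5' or 'v2013.07' anywhere in the page source
    return sorted(set(re.findall(r'[Vv]\d+(?:\.\d+)+', resp.text)))

if __name__ == '__main__':
    print(fetch_versions())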

First, click Terminal in the lower-left corner of PyCharm and run pip install requests and pip install beautifulsoup4 to install the required packages. (The code in this post also uses the lxml parser and the pymssql driver, so install those too: pip install lxml and pip install pymssql.)

1. Warm up first by writing a simple crawler that fetches the Baidu homepage.

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.baidu.com')  # Request the Baidu homepage
print(resp)  # Print the status code of the request result
print(resp.content)  # Print the source code of the requested web page

bsobj=BeautifulSoup(resp.content,'lxml') #Construct the webpage source code into a BeautifulSoup object for easy operation
a_list=bsobj.find_all('a') #Get all a tag objects in the webpage
text='' # Create an empty String
for a in a_list:
    href = a.get('href')  # Get the href attribute of the a tag, i.e. the link address it points to
    if href:  # Some a tags have no href attribute; skip them to avoid a TypeError
        text += href + '\n'  # Join onto the string, followed by a newline
with open('url.txt', 'w') as f:  # Under the current path, open a file named 'url.txt' for writing; create it if it does not exist
    f.write(text)  # Write the collected text into the file

2. Crawling the Laomaotao website

The Laomaotao website has anti-crawling protection, so I switched to another example shared by a fellow netizen and added the part that writes to the database.

The website we want to crawl: http://www.weather.com.cn/weather/101190401.shtml

1. First, create a new table t_tq in MS SQL Server.
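
The post does not show the table definition. Judging from the INSERT statement in the code below, t_tq has four columns, rq, tq, zgwd and zdwd (date, weather, highest and lowest temperature), all inserted as strings, so a minimal sketch of creating it from Python with pymssql could look like this; the NVARCHAR lengths are assumptions:

import pymssql

# Sketch: create the t_tq table used by the crawler below.
# Column names come from its INSERT statement; the NVARCHAR sizes are guesses.
db = pymssql.connect(host='127.0.0.1', user='sa', password='3201319', database='bhjs', port=1433)
cur = db.cursor()
cur.execute("""
    CREATE TABLE t_tq (
        rq   NVARCHAR(50),   -- date
        tq   NVARCHAR(50),   -- weather description
        zgwd NVARCHAR(20),   -- highest temperature
        zdwd NVARCHAR(20)    -- lowest temperature
    )
""")
db.commit()
db.close()

Equivalently, you can run the same CREATE TABLE statement directly in SQL Server Management Studio.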

2. In the PyCharm IDE, choose New Project -> Pure Python, create a new Python file testtq.py in the project, and copy in the following code.

import requests
import csv
import random
import time
import socket
import http.client
# import urllib.request
from bs4 import BeautifulSoup

import pymssql

def insert(results):
    # Open the database connection
    db = pymssql.connect(host='127.0.0.1', user='sa', password='3201319', database='bhjs', port=1433)

    # Use the cursor() method to obtain a cursor
    cur = db.cursor()
    sql = "INSERT INTO t_tq  (rq,tq,zgwd,zdwd)  VALUES (%s, %s,  %s, %s)"

    try:
        cur.execute(sql, (results['rq'], results['tq'], results['zgwd'], results['zdwd']))  # Execute the SQL statement
        # results = cur.fetchall()  # fetch all records
        db.commit()
    except Exception as e:
        raise e
    finally:
        db.close()  # Close the connection; insert() opens a new one for every row

    return results

def get_content(url , data = None):
    header={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url,headers = header,timeout = timeout)
            rep.encoding = 'utf-8'
            # req = urllib.request.Request(url, data, header)
            # response = urllib.request.urlopen(req, timeout=timeout)
            # html1 = response.read().decode('UTF-8', errors='ignore')
            # response.close()
            break
        # except urllib.request.HTTPError as e:
        #         print( '1:', e)
        #         time.sleep(random.choice(range(5, 10)))
        #
        # except urllib.request.URLError as e:
        #     print( '2:', e)
        #     time.sleep(random.choice(range(5, 10)))
        except socket.timeout as e:
            print( '3:', e)
            time.sleep(random.choice(range(8,15)))

        except socket.error as e:
            print( '4:', e)
            time.sleep(random.choice(range(20, 60)))

        except http.client.BadStatusLine as e:
            print( '5:', e)
            time.sleep(random.choice(range(30, 80)))

        except http.client.IncompleteRead as e:
            print( '6:', e)
            time.sleep(random.choice(range(5, 15)))

    return rep.text


def get_data(html_text):
    item_p={'rq':50,'tq':50,'zgwd':50,'zdwd':50}
    final = []
    bs = BeautifulSoup(html_text, "html.parser")  # Create the BeautifulSoup object
    body = bs.body  # Get the body section
    data = body.find('div', {'id': '7d'})  # Find the div with id '7d' (the 7-day forecast)
    ul = data.find('ul')  # Get the ul inside it
    li = ul.find_all('li')  # Get all the li elements (one per day)

    for day in li:  # Iterate over the content of each li tag
        temp = []
        date = day.find('h1').string  # Find the date
        temp.append(date)  # Append it to temp
        item_p['rq'] = date

        inf = day.find_all('p')  # Find all the p tags inside the li
        temp.append(inf[0].string)  # Append the content of the first p tag (the weather conditions) to temp

        item_p['tq'] = inf[0].string

        if inf[1].find('span') is None:
            temperature_highest = None  # The forecast may not give the day's highest temperature (this happens in the evening), so a check is needed before reading it
        else:
            temperature_highest = inf[1].find('span').string  # Find the highest temperature
            temperature_highest = temperature_highest.replace('℃', '')  # At night the site changes and a ℃ appears after the highest temperature too; strip it
        temperature_lowest = inf[1].find('i').string  # Find the lowest temperature
        temperature_lowest = temperature_lowest.replace('℃', '')  # The lowest temperature is followed by a ℃; remove the symbol
        temp.append(temperature_highest)  # Append the highest temperature to temp
        temp.append(temperature_lowest)  # Append the lowest temperature to temp

        item_p['zgwd'] = temperature_highest
        item_p['zdwd'] = temperature_lowest
        final.append(temp)  # Append temp to final

        insert(item_p)

    return final


def write_data(data, name):
    file_name = name
    with open(file_name, 'a', errors='ignore', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(data)


if __name__ == '__main__':
    url ='http://www.weather.com.cn/weather/101190401.shtml'
    html = get_content(url)
    result = get_data(html)
   # write_data(result, 'weather.csv')

 

3. Run the script, and you will get results like the following in SQL Server.
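
You can also verify the inserted rows from Python with a short read-back script, using the same connection settings as above:

import pymssql

# Quick check: read back the rows that testtq.py wrote into t_tq
db = pymssql.connect(host='127.0.0.1', user='sa', password='3201319', database='bhjs', port=1433)
cur = db.cursor()
cur.execute("SELECT rq, tq, zgwd, zdwd FROM t_tq")
for rq, tq, zgwd, zdwd in cur.fetchall():
    print(rq, tq, zgwd, zdwd)
db.close()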

 

 

             Use the best tools to fight steady, unglamorous battles. ——Zeng Guofan

             https://www.jetbrains.com/zh-cn/

            

 

Original article: blog.csdn.net/fanxiaoduo1/article/details/106599788