# Crawler: Sunshine Movies (阳光电影)

#mysqlhelper.py
import pymysql

class MysqlHelper(object):
    """Small wrapper around one pymysql connection and cursor for write statements.

    The connection is opened in ``__init__`` and closed when the object is
    finalized. Each call to ``execute_modify_sql`` runs one statement and
    commits it.
    """

    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='123456', database='py1011', charset='utf8'):
        """Open the database connection and create a cursor.

        All arguments are forwarded to ``pymysql.connect``. The defaults are
        the values that were previously hard-coded, so ``MysqlHelper()``
        behaves as before.
        """
        # Pre-set both attributes so __del__ stays safe if connect() raises.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  password=password, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def execute_modify_sql(self, sql, data=None):
        """Execute one data-modifying statement and commit it.

        Args:
            sql: SQL text, using ``%s`` placeholders for parameters.
            data: Parameters bound to the placeholders, or None for a
                statement without parameters.

        Raises:
            Exception: whatever the cursor or connection raises; the
                transaction is rolled back before the error is re-raised.
        """
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def __del__(self):
        """Close the cursor and the connection if they were ever created."""
        # getattr guards against instances whose __init__ never ran to the end.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

if __name__ == '__main__':
    # Manual smoke test when this module is run directly: constructing the
    # helper opens the database connection.
    conn = MysqlHelper()
    # Example insert (disabled):
    # conn.execute_modify_sql('insert into lianjiaxinxi(title) VALUE (%s)', data=('huzeqi hehehe'))

import requests,re




# Target site (commented-out listing loop for www.dytt8.net)
#
# for i in range(1,178):
#     # URL of each list page
#     lurl = 'http://www.dytt8.net/html/gndy/dyzz/list_23_%s.html' % i
#     # Request the page
#     response = requests.get(lurl)
#
#     html = response.text
#     # Use a regex to extract the detail-page links
#     movie_url_list = re.findall('<a href="(.*)" class="ulink"',html)
#     # Loop over the links and build each full detail-page URL
#     for movie_item in movie_url_list:
#         movie_url = 'http://www.dytt8.net'+movie_item
#         print(movie_url)

def getdetail(url):
    """Fetch one movie detail page and store its title and download link.

    The page is downloaded, decoded as GBK, and searched for the title
    heading and the download anchor. When both are found they are inserted
    into the ``yangguang`` table and printed. A page that cannot be fetched,
    or that lacks either field, is reported and skipped so that one bad page
    does not abort the whole crawl.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        None.

    Raises:
        Exception: database errors from ``MysqlHelper`` are not caught here.
    """
    import mysqlhelper

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('skip %s: request failed (%s)' % (url, exc))
        return

    # Replace undecodable bytes instead of raising on a single bad byte.
    html = response.content.decode('gbk', errors='replace')

    # Non-greedy / quote-bounded captures so a match cannot run past the
    # first closing tag or the end of the href attribute.
    title_match = re.search(r'<h1><font color=#07519a>(.*?)</f', html)
    torrent_match = re.search(r'ddf"><a href="([^"]*)">ft', html)
    if title_match is None or torrent_match is None:
        print('skip %s: title or download link not found' % url)
        return

    movie_title = title_match.group(1)
    movie_torrent = torrent_match.group(1)

    # Connect only once there is something to store.
    sql = 'INSERT INTO yangguang (movie_title, movie_torrent) VALUES (%s, %s)'
    myhelper = mysqlhelper.MysqlHelper()
    myhelper.execute_modify_sql(sql, (movie_title, movie_torrent))

    print(movie_title)
    print(movie_torrent)





def getpage(first_page=1, last_page=19):
    """Walk the movie list pages and process every detail link found on them.

    For each list page the HTML is fetched, the ``class="ulink"`` anchors are
    extracted, and ``getdetail`` is called with the absolute URL of each one.

    Args:
        first_page: Number of the first list page to fetch (inclusive).
        last_page: Number of the last list page to fetch (inclusive). The
            defaults cover pages 1 through 19, the range that was previously
            hard-coded.

    Raises:
        requests.RequestException: if a list page cannot be fetched.
    """
    site = 'http://www.ygdy8.com'
    # Built once: neither the headers nor the pattern change between pages.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
    }
    # The capture stops at the closing quote, so it cannot span two anchors
    # that happen to share a line.
    link_pattern = re.compile(r'<a href="([^"]*)" class="ulink"')

    for page_no in range(first_page, last_page + 1):
        list_url = '%s/html/gndy/dyzz/list_23_%s.html' % (site, page_no)
        # A timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(list_url, headers=headers, timeout=10)

        for movie_path in link_pattern.findall(response.text):
            getdetail(site + movie_path)


if __name__ == '__main__':
    # Script entry point: crawl the list pages and store each movie found.
    getpage()

# (Page footer residue: "猜你喜欢" / "You may also like"; not part of the script.)