This script scrapes sales data for listings on huzhan.com. Feel free to study it if you find it useful; without further ado, here is the code.
Please note: the following code is provided for learning and reference only, not for commercial use. Please credit the source when reposting and respect the original author.
SQL statement (creates the table that the script below inserts into):
-- Recreate the `cmf_huzhan` table from scratch; any existing table of that
-- name (and its data) is dropped first.
DROP TABLE IF EXISTS `cmf_huzhan`;
-- One row per scraped listing. The Python script below fills `title`,
-- `num`, `price` and `url`; `id` is auto-generated and `status` is left at
-- its default by that script.
-- NOTE(review): the column COMMENT strings look copied from a different
-- table and do not describe these columns (e.g. `price` is annotated as a
-- show/hide status flag, `num` as a sort order) -- confirm and correct.
CREATE TABLE `cmf_huzhan` (
`id` bigint(20) NOT NULL AUTO_INCREMENT,
`price` varchar(255) NOT NULL DEFAULT '1' COMMENT '状态;1:显示;0:不显示',
`title` varchar(255) NOT NULL COMMENT '友情链接评级',
`num` varchar(255) NOT NULL DEFAULT '0' COMMENT '排序',
`url` varchar(255) NOT NULL DEFAULT '' COMMENT '链接地址',
`status` varchar(30) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '' COMMENT '互站名称',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=56 DEFAULT CHARSET=utf8mb4 COMMENT='互站表';
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import logging
import time
import requests
import pymysql
from functools import reduce
from lxml import etree
import urllib3
# Silence urllib3 warnings for the whole process. The second call names
# InsecureRequestWarning explicitly; presumably this targets the warning
# raised by the verify=False POST made in total_s() -- confirm.
urllib3.disable_warnings()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class ORM():
    """Minimal active-record style base class that renders INSERT statements.

    Every public instance attribute (name not starting with ``_``) is treated
    as a column of the table; underscore-prefixed attributes are ignored when
    the statement is built.
    """

    def __init__(self, table):
        """Remember the name of the table that rows are inserted into."""
        self.__table__ = table

    def insert_string(self):
        """Return an ``INSERT`` statement built from the public attributes.

        String values (including ``str`` subclasses) are escaped and wrapped
        in single quotes, ``None`` is rendered as SQL ``NULL`` and every
        other value is rendered with ``str()``.

        Returns:
            str: ``insert into <table> (<columns>) values (<values>)``.
        """
        fields = []
        params = []
        # Collect the public attributes as column/value pairs.
        for k, v in self.__dict__.items():
            # Underscore-prefixed attributes (e.g. the table name) are
            # bookkeeping, not columns.
            if k.startswith('_'):
                continue
            fields.append(k)
            params.append(self._sql_literal(v))
        return 'insert into {} ({}) values ({})'.format(
            self.__table__, self.join(fields), self.join(params))

    @staticmethod
    def _sql_literal(value):
        """Convert one Python value into its SQL literal representation."""
        if value is None:
            return 'NULL'
        # isinstance (not ``type(v) == str``) so that str subclasses are
        # quoted and escaped as well instead of being pasted in verbatim.
        if isinstance(value, str):
            # Imported from ``pymysql.converters`` because the top-level
            # ``pymysql.escape_string`` alias is not available in newer
            # PyMySQL releases.
            from pymysql.converters import escape_string
            return "'" + escape_string(value) + "'"
        return value

    # join helper that also accepts numbers and other non-string items
    def join(self, attrs, pattern=','):
        """Join ``attrs`` with ``pattern``, converting each item with ``str()``.

        Args:
            attrs: Iterable of values to join; may be empty.
            pattern: Separator placed between items.

        Returns:
            str: The joined text, or ``''`` when ``attrs`` is empty.
        """
        return pattern.join(str(item) for item in attrs)
class Model(ORM):
    """Table model that owns a PyMySQL connection and runs simple queries.

    NOTE: ``select``, ``select_list``, ``select_jd`` and ``select_order``
    close the connection once they are done, so an instance cannot be
    queried again after calling one of them. ``select_sale``, ``tb_shop``,
    ``execute`` and ``insert`` leave the connection open.
    """

    def __init__(self, table):
        """Connect to the local ``py`` database and print the server version.

        Connection errors are reported on stdout instead of being raised; in
        that case ``_db`` and ``_cur`` stay ``None``.
        """
        super().__init__(table)
        # Defined up front so a failed connection leaves the attributes
        # present (as None) rather than missing.
        self._db = None
        self._cur = None
        try:
            # NOTE(review): the DDL shipped with this script uses utf8mb4 while
            # the connection asks for 'UTF8' -- confirm this is intended.
            self._db = pymysql.connect(host='localhost', port=3306, user='root',
                                       passwd='root',
                                       db='py',
                                       charset='UTF8')
            self._cur = self._db.cursor()
            self._cur.execute('SELECT VERSION()')
            data = self._cur.fetchone()
            if data:
                print('database version: %s' % data[0])
        except Exception as exc:
            # Best-effort: keep the object constructible, but show the cause.
            print("发生异常", exc)

    def _query(self, sql, args=None, close=False):
        """Run ``sql`` with ``args`` on a fresh cursor and return all rows.

        Args:
            sql: Statement text; values are supplied through ``args``.
            args: Optional sequence of query parameters.
            close: When true, close the connection afterwards, even if the
                query raised.
        """
        try:
            with self._db.cursor() as cursor:
                cursor.execute(sql, args)
                return cursor.fetchall()
        finally:
            if close and self._db is not None:
                self._db.close()

    def select(self, limit=2000000):
        """Return up to ``limit`` rows of the table as a list.

        ``limit`` must be convertible with ``int()``. Closes the connection.
        """
        sql = "SELECT * FROM {} WHERE 1 limit %s".format(self.__table__)
        return list(self._query(sql, (int(limit),), close=True))

    def select_list(self):
        """Return every row of the table as a list. Closes the connection."""
        sql = "SELECT * FROM {} WHERE 1 ".format(self.__table__)
        return list(self._query(sql, close=True))

    def select_sale(self, store_name, days):
        """Return sale rows of ``store_name`` whose ``update_time`` contains ``days``.

        Args:
            store_name: Exact value matched against ``store_name``.
            days: Text that must occur in ``update_time`` (substring match).

        Returns:
            The ``(good_id, sale_num, update_time)`` rows, or ``0`` when
            nothing matched. The connection is left open.
        """
        sql = ("SELECT good_id,sale_num,update_time FROM {} "
               "WHERE 1 and good_id<>1 and sale_num<>0 "
               "and store_name=%s and update_time like %s").format(self.__table__)
        # Values travel as query parameters so the driver escapes them.
        rows = self._query(sql, (store_name, '%' + days + '%'))
        return rows if rows else 0

    def select_jd(self):
        """Return all rows with ``status = 1`` as a list. Closes the connection."""
        sql = "SELECT * FROM {} WHERE 1 and status =1 ".format(self.__table__)
        return list(self._query(sql, close=True))

    def tb_shop(self, goodid):
        """Return the ``company`` stored for ``goodid``, or ``0`` if none exists.

        The connection is left open.
        """
        sql = "SELECT company FROM {} WHERE good_id=%s ".format(self.__table__)
        with self._db.cursor() as cursor:
            cursor.execute(sql, (goodid,))
            result = cursor.fetchone()
        if result is None:
            return 0
        return result[0]

    def select_order(self):
        """Return the rows whose ``orderStatus`` is '暂停'. Closes the connection.

        Unlike the other ``select*`` methods the driver's result is returned
        as-is rather than copied into a list.
        """
        sql = "SELECT * FROM `{}` where orderStatus='暂停' ".format(self.__table__)
        return self._query(sql, close=True)

    def execute(self, sql):
        """Run ``sql`` on the shared cursor and return the first result row."""
        self._cur.execute(sql)
        return self._cur.fetchone()

    def insert(self):
        """Insert the public attributes of this object as a new row.

        Returns:
            The auto-generated id of the new row, or ``None`` when the insert
            failed (the transaction is rolled back and the statement printed).
        """
        sql = self.insert_string()
        with self._db.cursor() as cursor:
            try:
                cursor.execute(sql)
                insert_id = cursor.lastrowid
                self._db.commit()
                return insert_id
            except Exception as exc:
                print("insert发生异常" + sql, exc)
                # Undo the partial transaction on failure.
                self._db.rollback()
                return None
def url_list(url='https://www.huzhan.com/code/key/%E5%B0%8F%E8%AF%B4/order/sales/page/2',
             max_items=24):
    """Scrape one listing page and hand every item to ``total_s``.

    Args:
        url: Listing page to fetch; defaults to the page the script was
            written for.
        max_items: Upper bound on the number of items processed per page.

    The title, price and link lists are walked in lockstep, so a page with
    fewer than ``max_items`` entries is processed fully instead of raising
    ``IndexError``.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Host': 'www.huzhan.com',
        'Referer': 'https://www.huzhan.com/code/goods185419.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # A timeout keeps the script from blocking forever on a stalled server.
    data = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(data)
    if html is None:
        # Nothing parseable came back, so there are no items to process.
        return
    # /html/body/div[3]/div[1]/div[3]/div/dl[3]/dd/p[2]/a
    title = html.xpath('//*/dd/p[2]/a/@title')
    price = html.xpath('//*/dd/p[1]/em/strong/text()')
    href = html.xpath('//*/dd/p[2]/a/@href')
    http = 'https://www.huzhan.com'
    items = zip(title[:max_items], price[:max_items], href[:max_items])
    for item_title, item_price, path in items:
        hrefs = http + path
        # Pause between detail requests to go easy on the site.
        time.sleep(2)
        total_s(hrefs, item_title, item_price, hrefs)
def total_s(Referer, title, price, hrefs):
    """Fetch the sales total of one listing and store it in ``cmf_huzhan``.

    Args:
        Referer: URL of the listing's detail page; fetched to find the
            product id and sent as the ``Referer`` header of the POST.
        title: Listing title to store.
        price: Listing price to store.
        hrefs: Listing URL to store.

    The function is best-effort: failures are logged with a traceback and
    never raised to the caller.
    """
    timeout = 10  # seconds; keeps a stalled server from blocking forever
    try:
        with requests.session() as s:
            print(Referer)
            retry = Retry(connect=3, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            logging.captureWarnings(True)  # ssl
            urllib3.disable_warnings()  # ssl
            urls = Referer
            # Content-Length is intentionally not set by hand: it depends on
            # the encoded form body, which varies with the product id.
            # NOTE(review): 'br' in Accept-Encoding presumably requires a
            # brotli decoder to be installed -- confirm.
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Host': 'www.huzhan.com',
                'Origin': 'https://www.huzhan.com',
                'Referer': urls,
                'Sec-Fetch-Mode': 'cors',
                'Sec-Fetch-Site': 'same-origin',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }
            headerss = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Host': 'www.huzhan.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.12 Safari/537.36'
            }
            time.sleep(1)
            da = requests.get(urls, headers=headerss, timeout=timeout).text
            htmls = etree.HTML(da)
            pro_ids = htmls.xpath('//*/div[1]/span[1]/a/@id')
            if not pro_ids:
                logging.warning('total_s: no product id found on %s', urls)
                return
            params = {
                "list": "geva",
                "pro": pro_ids[0],
                "good": "code",
                "page": 0
            }
            start = time.time()
            r = s.post("https://www.huzhan.com/apage/", data=params, headers=headers,
                       verify=False, timeout=timeout).text
        # Prefer json.loads over eval for parsing the response.
        html = json.loads(r)
        print(str(title))
        cmf_huzhan = Model('cmf_huzhan')
        try:
            cmf_huzhan.title = str(title)
            cmf_huzhan.num = html['total']
            cmf_huzhan.price = price
            cmf_huzhan.url = hrefs
            cmf_huzhan.insert()
        finally:
            # A new connection is opened per call; release it here so the
            # script does not pile up open connections.
            db = getattr(cmf_huzhan, '_db', None)
            if db is not None:
                db.close()
        end = time.time() - start
        print(end)
    except Exception:
        # Still best-effort, but leave a trace of what went wrong.
        logging.exception('total_s failed for %s', Referer)
if __name__ == '__main__':
    # Start scraping only when run as a script, not when the module is
    # imported by other code.
    url_list()
The result is as follows.
Summary
One thing requires special attention: the tricky part of this scraper is the POST request that fetches the data:
r = s.post("https://www.huzhan.com/apage/", data=params, headers=headers, verify=False).text