大众点评的评论正文中,部分文字被替换成了带 CSS 类名的标签,这些文字在页面源码里不显示,也无法复制。
思路:先构造"标签 → 文字"的对应库,爬取网页之后,再用这个库把标签自行还原成文字。
不讲理论,只上代码(Python 3),如下:
import math
import re
import requests
def get_css_and_px_dict(con):
    """Extract the background offsets of every glyph class in a stylesheet.

    Args:
        con: Text of the CSS file.

    Returns:
        A dict mapping each class name (without the leading dot) to a
        two-item list ``[x, y]`` holding the first and second ``background``
        pixel values exactly as they appear in the CSS, as strings. Only
        rules whose two values are negative decimals (e.g. ``-14.0px``)
        are matched by the pattern.
    """
    rule_pattern = r'(\.[a-zA-Z0-9-]+)\{background:(\-\d+\.\d+)px (\-\d+\.\d+)px'
    return {
        selector[1:]: [x_px, y_px]  # selector[1:] drops the leading "."
        for selector, x_px, y_px in re.findall(rule_pattern, con)
    }
def get_svg_urls(con):
    """Find the SVG sprite URL attached to each class-name prefix.

    Looks for ``[class^="prefix"]{... (url) ...}`` rules in the stylesheet
    text and collects the text found between the parentheses.

    Args:
        con: Text of the CSS file.

    Returns:
        A dict mapping each class prefix to its URL. Protocol-relative
        URLs (starting with ``//``) are given an ``http:`` scheme; any
        other URL is returned unchanged.
    """
    rule_pattern = r'\[class\^=\"([a-zA-Z0-9-]+)\"\]{[a-zA-Z0-9/.\-: ;_]+\(([a-zA-Z0-9/.\-: ;_]+)\)[a-zA-Z0-9/.\-: ;_]+}'
    return {
        class_prefix: ('http:' + svg_url if svg_url.startswith('//') else svg_url)
        for class_prefix, svg_url in re.findall(rule_pattern, con)
    }
def download_content(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parsers.
    response.raise_for_status()
    return response.content.decode("utf-8")
def parse_result(x_src, y_src, svg_content, y_offset=23):
    """Return the character in an SVG sprite that a CSS offset points at.

    The SVG is expected to contain ``<path id=".." d="..."/>`` elements
    whose ``d`` attribute has exactly three space-separated tokens (the
    middle one is used as the row key), a ``font-size:<n>px`` declaration,
    and ``<textPath xlink:href="#id" ...>text</textPath>`` rows.

    Args:
        x_src: Horizontal background offset (number or numeric string);
            only its absolute value is used.
        y_src: Vertical background offset (number or numeric string);
            only its absolute value is used.
        svg_content: Text of the SVG file.
        y_offset: Amount added to ``abs(y_src)`` to obtain the row key.
            NOTE(review): 23 is the value the original code hard-coded;
            presumably it depends on the sprite layout - confirm before
            relying on it for other SVG files.

    Returns:
        The single character found, or ``None`` when no row, no row text,
        or no column matches the given offsets.

    Raises:
        IndexError: If the SVG has no ``font-size:<n>px`` declaration.
        ValueError: If a path's ``d`` attribute does not have exactly
            three tokens, or an offset is not numeric.
    """
    # Row key (middle token of the path's "d" attribute) -> path id.
    row_id_by_y = {}
    for path_id, path_def in re.findall(r'<path id="(\d+)" d="([a-zA-Z0-9 ]+)"/>', svg_content):
        _, y_position, _ = path_def.split(' ')
        row_id_by_y[y_position] = path_id

    font_size = int(re.findall(r'font-size:(\d+)px', svg_content)[0])

    # Path id -> the line of text drawn along that path.
    text_by_row_id = dict(
        re.findall(r'<textPath xlink:href="#(\d+)" textLength="\d+">(.*)</textPath>', svg_content)
    )

    # Column index: horizontal offset divided by the width of one glyph.
    x_index = int(math.fabs(float(x_src)) / font_size)
    y = int(math.fabs(float(y_src))) + y_offset

    row_id = row_id_by_y.get(str(y))
    if row_id is None:
        return None

    line = text_by_row_id.get(row_id)
    # Treat a missing text row or an out-of-range column like a missing
    # row: report "not found" rather than raising KeyError/IndexError.
    if line is None or x_index >= len(line):
        return None
    return line[x_index]
def work(css_url):
    """Build and print the class-name -> character table for one stylesheet.

    Downloads the CSS at *css_url*, prints the SVG URLs it references,
    downloads each SVG, resolves every glyph class against each SVG whose
    prefix the class name starts with, and prints the resulting dict.
    Classes whose offsets match nothing in the SVG map to ``None``.

    Args:
        css_url: URL of the CSS file to use as the entry point.
    """
    stylesheet = download_content(css_url)
    css_data = get_css_and_px_dict(stylesheet)
    svg_urls = get_svg_urls(stylesheet)
    print(svg_urls)

    svg_by_prefix = {
        prefix: download_content(svg_url) for prefix, svg_url in svg_urls.items()
    }

    result_dict = {}
    for css_id, (x_src, y_src) in css_data.items():
        for prefix, svg_text in svg_by_prefix.items():
            if not css_id.startswith(prefix):
                continue
            result_dict[css_id] = parse_result(x_src, y_src, svg_text)
    print(result_dict)
def run():
    """Run ``work`` against the hard-coded stylesheet URL."""
    work(
        'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/5de9a9098d8d30d7d65f16ab93871bb4.css'
    )
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    run()
每个网页都引用了一个 CSS 文件,把该 CSS 文件的 URL 作为入口传给 work() 即可。
需要爬大众点评的请联系QQ:739848314,价低质优