The company operates in the sensor industry as an upstream supplier, and many downstream manufacturers need its sensors. Against this background, this crawler was written to help the company collect the email addresses and telephone numbers of potential customers. The keyword "CO2 DETECTION" is used as the example here; to target other products, simply change the keyword.
1. Generate bing base_url based on keywords
import re
def get_bing_url(keywords):
    """Build a Bing search URL for the given keyword phrase.

    Leading/trailing newlines are stripped, then each whitespace
    character is replaced by '+' so the phrase can be used as the
    value of the q= query parameter.
    """
    phrase = keywords.strip('\n')
    prefixed = re.sub(r'^', 'https://www.bing.com/search?q=', phrase)
    return re.sub(r'\s', '+', prefixed)
if __name__ == '__main__':
    # Demo run: print the search URL built for the example keyword.
    bing_url = get_bing_url('CO2 DETECTION')
    print(bing_url)
2. According to the bing page turning rules, simulate the bing page turning link
# Build the URL for every results page following Bing's pagination scheme.
bing_url = get_bing_url(keywords.keywords)
for page in range(1, 100):  # walk through the result pages
    print(page)
    time.sleep(random.randint(3, 5))  # randomized delay between requests
    if page == 1:
        url = bing_url
    else:
        # first=<offset of the first result shown on this page>
        url = bing_url + '&qs=ds&first=' + str((page * 10) - 1) + '&FORM=PERE'
3. Use selenium to simulate opening the link and get the source code of the website
Selenium is used here to drive a real browser for opening pages and paging through the results. You could also use requests instead; however, with a high volume of visits the crawled data came back identical each time (presumably cached or blocked responses), so Selenium replaced it.
# Open the URL in the browser and capture the rendered page source.
try:
    browser.set_page_load_timeout(100)  # abort loads slower than 100 seconds
    browser.get(url)
    cookie_ = browser.get_cookies()
    html = browser.page_source
except Exception as e:
    # Best-effort logging: record the failure and move on to the next page.
    with open('error.txt', 'a', encoding='utf-8') as f:
        f.write(str(e) + '\n')
4. Parse the page source with XPath and extract the result URLs from it
# Parse the rendered page and harvest result links from Bing's result list.
tree = etree.HTML(html)
li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
# Matches a bare domain name: dot-separated labels of up to 63 chars each.
domain_pattern = re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?')
for li in li_list:
    try:
        url_text = li.xpath('./div/a/@href')[0]
    except Exception as e:
        # Result item without the expected link structure: log and skip.
        with open('url.txt', 'a', encoding='utf-8') as f:
            f.write(str(e) + url + '\n')
    else:
        # BUG FIX: the compiled pattern was previously unused (the regex was
        # duplicated inline) and .group() crashed when no match was found.
        match = domain_pattern.search(url_text)
        if match is None:
            continue  # href carries no domain-like substring
        domain_text = match.group()
        print(domain_text)
        # Deduplicate by domain: one product site can appear under many URLs.
        if Domain.objects.filter(domain=domain_text).exists():
            print('domain存在:' + url_text)
        else:
            Domain(domain=domain_text, keywords=keywords).save()
Here a regular expression extracts the domain name from each result link: one product site may appear under many different URLs, so deduplicating directly on the domain name reduces the workload.
5. The complete code is as follows
import requests
import re
from lxml.html import etree
import os, sys
import django
from workhelp import settings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random
sys.path.append('../../')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "workhelp.settings")
django.setup()
from infomation.models import Domain, KeyWords
# Chrome setup: spoof a mobile user-agent and silence console noise.
chrome_options = Options()
# chrome_options.add_argument('--headless')  # enable for headless crawling
mobile_ua = ('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) '
             'AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
chrome_options.add_argument(mobile_ua)
chrome_options.add_argument('--log-level=3')
browser = webdriver.Chrome(chrome_options=chrome_options)
def get_bing_url(keywords):
    """Return the Bing search URL for *keywords*.

    Newlines at either end are stripped; every whitespace character
    is turned into '+' so the phrase fits in the q= query parameter.
    """
    query = keywords.strip('\n')
    with_prefix = re.sub(r'^', 'https://www.bing.com/search?q=', query)
    return re.sub(r'\s', '+', with_prefix)
# Crawl every active keyword: walk Bing's result pages, pull the result
# links, and store each previously-unseen domain for later contact mining.
# Matches a bare domain name: dot-separated labels of up to 63 chars each.
DOMAIN_PATTERN = re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?')
keywords_list = KeyWords.objects.all().filter(status=True)
for keywords in keywords_list:
    bing_url = get_bing_url(keywords.keywords)
    for i in range(1, 100):  # page through the Bing results
        print(i)
        time.sleep(random.randint(3, 5))  # polite randomized delay
        if i == 1:
            url = bing_url
        else:
            # first=<offset of the first result shown on this page>
            url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
        try:
            browser.set_page_load_timeout(100)  # give slow pages up to 100s
            browser.get(url)
            cookie_ = browser.get_cookies()
            html = browser.page_source
        except Exception as e:
            # Best effort: log the failure and continue with the next page.
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(str(e) + '\n')
        else:
            tree = etree.HTML(html)
            li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
            for li in li_list:
                try:
                    url_text = li.xpath('./div/a/@href')[0]
                except Exception as e:
                    # Result item without the expected link: log and skip.
                    with open('url.txt', 'a', encoding='utf-8') as f:
                        f.write(str(e) + url + '\n')
                else:
                    # BUG FIX: the compiled pattern was previously unused and
                    # .group() crashed when the href had no domain-like text.
                    match = DOMAIN_PATTERN.search(url_text)
                    if match is None:
                        continue
                    domain_text = match.group()
                    print(domain_text)
                    # Deduplicate by domain: many URLs share one site.
                    if Domain.objects.filter(domain=domain_text).exists():
                        print('domain存在:' + url_text)
                    else:
                        Domain(domain=domain_text, keywords=keywords).save()
    # Mark the keyword as processed so re-runs skip it.
    keywords.status = False
    keywords.save()
A brief note on the code above: the data model was built with Django, so the crawler works directly with the Django models. If anything is unclear, feel free to leave a message or contact me directly.
6. After the domain names are extracted, the next step is to open each site and find the email addresses and telephone numbers for business development
As SEOers who can program, we will certainly not do this by hand. We reuse the approach above: drive the browser to open each site, save the page source to the database, and take a screenshot of the opened page so that sales staff can judge whether it is a potential customer. This is just one idea — better approaches are welcome for discussion. The code is as follows.
import os
import django
import re
import datetime
import uuid
import sys
sys.path.append('../../')
from workhelp import settings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "workhelp.settings")
django.setup()
def replaceCharEntity(htmlstr):
    """Decode a handful of common HTML character entities in *htmlstr*.

    Handles the named and numeric forms of nbsp, lt, gt, amp and quot;
    any other entity is deleted outright. Substitution repeats from the
    start of the string until no entity remains, so text produced by an
    earlier replacement is scanned again.

    :param htmlstr: text possibly containing entities such as &gt; or &#62;
    :return: the text with known entities decoded and unknown ones removed
    """
    CHAR_ENTITIES = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"', }
    entity_re = re.compile(r'&#?(?P<name>\w+);')
    found = entity_re.search(htmlstr)
    while found:
        name = found.group('name')  # entity body, e.g. 'gt' for &gt;
        # Unknown entities fall back to the empty string (i.e. deletion).
        replacement = CHAR_ENTITIES.get(name, '')
        htmlstr = entity_re.sub(replacement, htmlstr, 1)
        found = entity_re.search(htmlstr)
    return htmlstr
def filter_tags(htmlstr):
    """Reduce an HTML document to its visible text.

    Strips CDATA sections, <script>/<style> blocks, remaining tags and
    HTML comments, converts <br> into newlines, collapses newline runs
    into single spaces, and finally decodes common character entities.

    :param htmlstr: raw HTML source
    :return: plain text extracted from the markup
    """
    rules = [
        (re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I), ''),                # CDATA sections
        (re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I), ''),  # script blocks
        (re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I), ''),    # style blocks
        (re.compile('<br\s*?/?>'), '\n'),                                   # line breaks
        (re.compile('</?\w+[^>]*>'), ''),                                   # any remaining tag
        (re.compile('<!--[^>]*-->'), ''),                                   # HTML comments
        (re.compile('\n+'), ' '),                                           # newline runs
    ]
    text = htmlstr
    for pattern, repl in rules:
        text = pattern.sub(repl, text)
    return replaceCharEntity(text)  # decode entities last
from infomation.models import Domain,Info
# Visit each not-yet-crawled domain in batches of 300: save the rendered
# HTML, the visible text, and a screenshot so sales staff can qualify the
# lead visually. NOTE(review): this loop never terminates — it keeps
# polling for new uncrawled domains, spinning up a fresh browser per pass.
while True:
    domains = Domain.objects.all().filter(is_cut=False)[:300]
    print(domains.count())
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--log-level=3')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    # Screenshots live under MEDIA_ROOT/images/jietu; ensure it once per batch
    # instead of re-checking inside the per-domain loop.
    shot_dir = os.path.join(settings.MEDIA_ROOT, 'images', 'jietu')
    if not os.path.exists(shot_dir):
        os.makedirs(shot_dir)
    for item in domains:
        print(item.domain)
        try:
            browser.set_page_load_timeout(100)  # give slow sites up to 100s
            browser.get('http://' + item.domain)
            html = browser.page_source
            text = filter_tags(html)
            # BUG FIX: '{}.{}'.format(url, '.png') produced 'name..png'
            # (double dot); also use the explicit domain field for the name.
            pic_name = '{}.png'.format(item.domain)
            browser.get_screenshot_as_file(os.path.join(shot_dir, pic_name))
        except Exception as e:
            # Best effort: log the failure and continue with the next domain.
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(str(e) + '\n')
        else:
            # 'item' is already the row we want — the previous code re-fetched
            # it with Domain.objects.get(domain=url), passing a model instance
            # as a CharField lookup and shadowing the loop variable.
            item.image = os.path.join('images', 'jietu', pic_name)  # media-relative path
            item.html = html
            item.content = text
            item.is_cut = True
            item.save()
        # Email/phone extraction from the saved source happens in a later step.
    browser.quit()
The last step is to use regular expressions or other methods to analyze the saved page source and extract the contact information from it.