Get potential customers' phone numbers and email addresses through Bing

The company operates in the sensor industry, an upstream business whose downstream manufacturers all need sensors. Against this background, I wrote a crawler to help the company collect potential customers' email addresses and phone numbers. Here I use the keyword CO2 DETECTION as an example; to target other products, you only need to change the keyword.

1. Generate the Bing base_url from the keywords

import re


def get_bing_url(keywords):
    keywords = keywords.strip('\n')
    bing_url = re.sub(r'^', 'https://www.bing.com/search?q=', keywords)
    bing_url = re.sub(r'\s', '+', bing_url)
    return bing_url


if __name__ == '__main__':
    bing_url = get_bing_url('CO2 DETECTION')
    print(bing_url)
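
Running the snippet prints https://www.bing.com/search?q=CO2+DETECTION, which becomes the base URL for paging in the next step.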

2. Simulate Bing's pagination links based on its page-turning rules

bing_url = get_bing_url(keywords.keywords)
for i in range(1, 100):  # page through the results with a for loop
    print(i)
    time.sleep(random.randint(3, 5))
    if i == 1:
        url = bing_url
    else:
        url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
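
To make the paging rule concrete, here is a quick standalone check of the URLs the loop above generates (using the base URL from step 1):

# prints the first three page URLs produced by the paging rule above
bing_url = 'https://www.bing.com/search?q=CO2+DETECTION'
for i in range(1, 4):
    if i == 1:
        url = bing_url
    else:
        url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
    print(url)
# https://www.bing.com/search?q=CO2+DETECTION
# https://www.bing.com/search?q=CO2+DETECTION&qs=ds&first=19&FORM=PERE
# https://www.bing.com/search?q=CO2+DETECTION&qs=ds&first=29&FORM=PERE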

3. Use Selenium to open each link and get the page source

Here I use Selenium to simulate a browser opening the pages and turning them. Of course, you can also use requests; in my case, once the number of visits grew the crawled data kept coming back identical, so I replaced requests with Selenium (a requests sketch follows the snippet below).

try:
    browser.set_page_load_timeout(100)  # set the page-load timeout to 100 seconds
    browser.get(url)
    cookie_ = browser.get_cookies()
    # browser.add_cookie(cookie_dict=cookie_)
    html = browser.page_source
except Exception as e:
    with open('error.txt', 'a', encoding='utf-8') as f:
        f.write(str(e) + '\n')
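
If you would rather try the requests route mentioned above, a minimal sketch might look like this (the User-Agent header and error handling are my own assumptions, and Bing may still throttle or CAPTCHA a plain HTTP client):

import requests

def fetch_page(url):
    # Fetch one results page with requests instead of Selenium.
    # The User-Agent is an arbitrary desktop UA, not the one from the script below.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        with open('error.txt', 'a', encoding='utf-8') as f:
            f.write(str(e) + '\n')
        return None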

4. Parse the page source with XPath and extract the result URLs

tree = etree.HTML(html)
li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
for li in li_list:
    try:
        url_text = li.xpath('./div/a/@href')[0]
        # print(url_text)
    except Exception as e:
        with open('url.txt', 'a', encoding='utf-8') as f:
            f.write(str(e) + url + '\n')
    else:
        domain_pattern = re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?')
        domain_text = domain_pattern.search(url_text).group()
        print(domain_text)
        if Domain.objects.filter(domain=domain_text).exists():
            print('domain exists: ' + url_text)
        else:
            domain = Domain(domain=domain_text, keywords=keywords)
            domain.save()

Here a regular expression extracts the domain name from each result link: a single product may appear under many different URLs on the same site, so deduplicating by domain name cuts down the workload.
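
For example, two different product URLs on the same site both reduce to one domain (the URLs below are made-up examples):

import re

domain_pattern = re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?')
for url_text in ('https://www.example.com/co2-sensor',
                 'https://www.example.com/products/detector'):
    print(domain_pattern.search(url_text).group())  # prints www.example.com both times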

5. The complete code is as follows

import re
from lxml.html import etree
import os, sys
import django
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random

sys.path.append('../../')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "workhelp.settings")
django.setup()
from infomation.models import Domain, KeyWords

chrome_options = Options()
# chrome_options.add_argument('--headless')
chrome_options.add_argument(
    'user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
chrome_options.add_argument('--log-level=3')
browser = webdriver.Chrome(chrome_options=chrome_options)


def get_bing_url(keywords):
    keywords = keywords.strip('\n')
    bing_url = re.sub(r'^', 'https://www.bing.com/search?q=', keywords)
    bing_url = re.sub(r'\s', '+', bing_url)
    return bing_url


keywords_list = KeyWords.objects.all().filter(status=True)
for keywords in keywords_list:
    bing_url = get_bing_url(keywords.keywords)
    for i in range(1, 100):  # page through the results with a for loop
        print(i)
        time.sleep(random.randint(3, 5))
        if i == 1:
            url = bing_url
        else:
            url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
        try:
            browser.set_page_load_timeout(100)  # set the page-load timeout to 100 seconds
            browser.get(url)
            cookie_ = browser.get_cookies()
            # browser.add_cookie(cookie_dict=cookie_)
            html = browser.page_source
        except Exception as e:
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(str(e) + '\n')
        else:
            tree = etree.HTML(html)
            li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
            for li in li_list:
                try:
                    url_text = li.xpath('./div/a/@href')[0]
                    # print(url_text)
                except Exception as e:
                    with open('url.txt', 'a', encoding='utf-8') as f:
                        f.write(str(e) + url + '\n')
                else:
                    domain_pattern = re.compile(r'[a-zA-Z0-9][-a-zA-Z0-9]{0,62}(\.[a-zA-Z0-9][-a-zA-Z0-9]{0,62})+\.?')
                    domain_text = domain_pattern.search(url_text).group()
                    print(domain_text)
                    if Domain.objects.filter(domain=domain_text).exists():
                        print('domain exists: ' + url_text)
                    else:
                        domain = Domain(domain=domain_text, keywords=keywords)
                        domain.save()
    keywords.status = False
    keywords.save()

A brief note on the code above: the models are built with Django, so the crawler writes its results straight into the Django models. If anything is unclear, you can leave a message and contact me directly.
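
The models themselves are not shown in this post; for readers who want to reproduce it, here is a minimal sketch of what infomation/models.py could look like, inferred from the fields the scripts access (the field types and lengths are my assumptions, not the original definitions):

from django.db import models


class KeyWords(models.Model):
    keywords = models.CharField(max_length=255)
    status = models.BooleanField(default=True)  # True = not yet crawled


class Domain(models.Model):
    domain = models.CharField(max_length=255)
    keywords = models.ForeignKey(KeyWords, on_delete=models.CASCADE)
    is_cut = models.BooleanField(default=False)  # True once the site has been opened
    image = models.CharField(max_length=255, blank=True)  # screenshot path
    html = models.TextField(blank=True)
    content = models.TextField(blank=True)


class Info(models.Model):
    email = models.TextField(blank=True)
    url = models.ForeignKey(Domain, on_delete=models.CASCADE)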

6. After the domain names are extracted, the next step is to open each site, find the email addresses and phone numbers, and develop the leads

As an SEOer who can write code, we are certainly not going to do that by hand. We keep using the approach above: simulate a browser to open each site, grab the page source, save it to the database, and take a screenshot of the opened page so that the sales staff can judge whether the site belongs to a potential customer. This is just one idea; if there are better ways, let's discuss them together. The code is as follows:

import os
import django
import re
import sys

sys.path.append('../../')
from workhelp import settings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "workhelp.settings")
django.setup()


def replaceCharEntity(htmlstr):
    """
    Replace common HTML character entities.
    :param htmlstr: the string to process
    :return:
    """
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(htmlstr)
    while sz:
        entity = sz.group()  # the full entity, e.g. "&gt;"
        key = sz.group('name')  # the name without "&" and ";", e.g. "gt"
        try:
            htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
        except KeyError:
            # unknown entity: replace it with an empty string
            htmlstr = re_charEntity.sub('', htmlstr, 1)
            sz = re_charEntity.search(htmlstr)
    return htmlstr


def filter_tags(htmlstr):
    """
    Strip HTML tags from a string.
    :param htmlstr: the HTML to filter
    :return:
    """
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA sections
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # script blocks
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style blocks
    re_br = re.compile(r'<br\s*?/?>')  # line breaks
    re_h = re.compile(r'</?\w+[^>]*>')  # HTML tags
    re_comment = re.compile(r'<!--[^>]*-->')  # HTML comments
    s = re_cdata.sub('', htmlstr)  # remove CDATA
    s = re_script.sub('', s)  # remove scripts
    s = re_style.sub('', s)  # remove styles
    s = re_br.sub('\n', s)  # turn <br> into newlines
    s = re_h.sub('', s)  # remove remaining tags
    s = re_comment.sub('', s)  # remove comments
    # collapse consecutive newlines
    blank_line = re.compile(r'\n+')
    s = blank_line.sub(' ', s)
    s = replaceCharEntity(s)  # replace character entities
    return s


from infomation.models import Domain, Info

while True:
    urls = Domain.objects.all().filter(is_cut=False)[:300]
    print(urls.count())
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument(
    #     'user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"')
    chrome_options.add_argument('--log-level=3')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    for url in urls:
        print(url.domain)
        pic_path = os.path.join(settings.MEDIA_ROOT, 'images', 'jietu')
        if not os.path.exists(pic_path):
            os.makedirs(pic_path)
        try:
            browser.set_page_load_timeout(100)  # set the page-load timeout to 100 seconds
            browser.get('http://' + url.domain)
            html = browser.page_source
            s = filter_tags(html)
            pic_name = '{}.png'.format(url.domain)
            browser.get_screenshot_as_file(os.path.join(pic_path, pic_name))
        except Exception as e:
            with open('error.txt', 'a', encoding='utf-8') as f:
                f.write(str(e) + '\n')
        else:
            pic_path = os.path.join('images', 'jietu', pic_name)
            url.image = pic_path
            url.html = html
            url.content = s
            url.is_cut = True
            url.save()
            # email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}')
            # emails = email_pattern.findall(html)
            # new_emails = list(set(emails))
            # print(new_emails)
            # emails_str = ','.join(new_emails)
            # print(emails_str)
            # info = Info(email=emails_str, url=url)
            # info.save()

    browser.quit()

The last step is to use regular expressions (or any other method) to analyze the saved page source and extract the contact information from it.
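
For reference, a minimal sketch of that extraction step, reusing the email pattern from the commented-out block above (the phone pattern is my own rough assumption and will need tuning for each target market):

import re

def extract_contacts(text):
    # Emails: the pattern from the commented-out block in the script above.
    email_pattern = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}')
    # Phones: a deliberately loose guess — digits plus common separators.
    phone_pattern = re.compile(r'\+?\d[\d\s().-]{7,}\d')
    emails = sorted(set(email_pattern.findall(text)))
    phones = sorted(set(phone_pattern.findall(text)))
    return emails, phones

emails, phones = extract_contacts('Contact: sales@example.com, +1 555 123 4567')
print(emails)  # ['sales@example.com']
print(phones)  # ['+1 555 123 4567']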
