Python crawler example - save hot searches to specified txt file (with comments)

1. Purpose of the program

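Crawl the real-time Weibo hot-search lists and save the selected list to a txt file named "<list name> <timestamp>.txt" (the list name followed by the time at which the data was fetched).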

2. Precautions

1. Cookies are not included in this article; you need to fill in your own.

2. The target website code may change over time.

3. There are two ways to print colored text: the colorama library and ANSI escape codes. Choose whichever suits your needs.

3. Third-party library installation

Run the following commands in cmd:

pip install requests
pip install bs4
pip install colorama

4. Global variables

# Accumulates the Weibo data that is later written to the txt file
weibo = list()
# datetime object holding the current date and time, captured once
time = datetime.datetime.now()
# Timestamp shown next to the content ("YYYY-MM-DD HH:MM:SS")
formatted_time = f"{time:%Y-%m-%d %H:%M:%S}"
# Timestamp used in the file name (dashes instead of colons)
formatted_time_1 = f"{time:%Y-%m-%d %H-%M-%S}"

5. Get HTML

1. Get cookies

How to get them: open the site in a browser (Chrome/Edge) -> press F12 -> select the "Application" tab -> Cookies

2. Code implementation 

def GetHTMLText(url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout, invalid URL, or an HTTP error status).
    """
    cookies = {
        # Fill in your own cookies here
    }
    try:
        r = requests.get(url, timeout=30, cookies=cookies)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: only request-related failures map to "".
        # A bare ``except`` would also hide KeyboardInterrupt and real bugs.
        return ""
    r.encoding = 'utf-8'
    return r.text

4. Process text

This step handles ordinary entries, pinned entries, advertisements, and special lists.

def _tag_text(tag):
    """Return the text of *tag* as a plain ``str``, never ``None``."""
    text = tag.string
    if text is None:
        # ``.string`` is None when the tag has several children; fall back to
        # the combined text so callers never store or print None.
        text = tag.get_text(strip=True)
    return str(text)


def fillData(soup):
    """Print the rows of the parsed list page and append them to ``weibo``.

    Every ``tr`` returned by ``soup.find_all('tr', class_="")`` is examined:

    * a row containing ``<i class="icon-top">`` is a pinned entry; it is
      printed as ``TOP <title>`` and stored as ``["TOP ", title]``;
    * a row whose ``<td class="td-01 ranktop">`` text is ``'•'`` is treated
      as an advertisement and skipped;
    * any other row is numbered (01, 02, ...); when it has a ``span`` the
      span text is printed after the title and stored after ``' |'``,
      otherwise (special lists) only the number and title are kept.

    Rows without an ``a`` tag are skipped instead of raising
    ``AttributeError``.

    Args:
        soup: Parsed page (BeautifulSoup object) to read the rows from.
    """
    # Colours come from colorama's ``Fore``; raw ANSI escape codes
    # (e.g. "\033[32m...\033[0m") can be used instead if preferred.
    n = 1
    for data in soup.find_all('tr', class_=""):
        # Look up the a / span tags inside the current tr
        a = data.find('a')
        if a is None:
            # No link means no title to show or save
            continue
        title = _tag_text(a)
        span = data.find('span')

        # Pinned entry
        if data.find('i', class_="icon-top") is not None:
            print(Fore.GREEN + "TOP %s" % title)
            weibo.append(["TOP ", title])
            continue

        # Advertisement
        ad = data.find('td', class_="td-01 ranktop")
        if ad is not None and ad.string == '•':
            continue

        # Same zero-padded rank on screen and in the saved data
        rank = "%02d" % n
        if span is not None:
            extra = _tag_text(span)
            print(Fore.YELLOW + "%s、" % rank, end='')
            print(Fore.CYAN + "%s " % title, end='')
            print(Fore.RESET + "%s" % extra)
            weibo.append([rank, '、', title, ' |', extra])
        else:  # Special lists: rows carry no span
            print(Fore.YELLOW + "%s、" % rank, end='')
            print(Fore.CYAN + "%s" % title)
            weibo.append([rank, '、', title])
        n += 1

5. Initiate an HTTP request to obtain page content

def main():
    """Run one crawl: choose a list, download its page, parse it, record it."""
    # Evaluated innermost first: menu choice -> page download -> HTML parse
    page = BeautifulSoup(GetHTMLText(web_choice()), "html.parser")
    fillData(page)

6. Handle the user's choice of list

def web_choice():
    """Show the list menu, read the user's choice and return the list URL.

    The prompt is repeated until one of the menu keys ('1'-'5') is entered,
    so an unknown choice no longer raises ``KeyError``. Surrounding
    whitespace in the input is ignored.

    Side effects: clears the terminal, prints the list title and the
    timestamp, and appends both (title first) to ``weibo``.

    Returns:
        The URL of the selected list under ``https://s.weibo.com/top/``.
    """
    web = {
        '1': {
            'add': 'summary',
            'name': '热搜榜'
        },
        '2': {
            'add': 'summary?cate=socialevent',
            'name': '要闻榜'
        },
        '3': {
            'add': 'summary?cate=entrank',
            'name': '文娱榜'
        },
        '4': {
            'add': 'summary?cate=sport',
            'name': '体育榜'
        },
        '5': {
            'add': 'summary?cate=game',
            'name': '游戏榜'
        },
    }
    # 'cls' exists only on Windows; other systems use 'clear'
    clear_cmd = 'cls' if os.name == 'nt' else 'clear'

    os.system(clear_cmd)  # Clear the screen
    print("1-热搜榜\t\t2-要闻榜\t\t3-文娱榜\n4-体育榜\t\t5-游戏榜")
    choice = input("请输入你的选择:").strip()
    while choice not in web:
        # Ask again rather than crash on an unknown menu entry
        choice = input("请输入你的选择:").strip()
    selected = web[choice]

    html = 'https://s.weibo.com/top/' + selected['add']
    os.system(clear_cmd)
    print(Fore.LIGHTMAGENTA_EX + "\t微博搜索-" + selected['name'])
    # Save the list title as the first element of weibo
    weibo.append("微博搜索-" + selected['name'])
    # Timestamp
    print(Style.RESET_ALL, end='')  # Reset all styles
    print('截止于:', formatted_time)
    weibo.append('截止于:' + formatted_time)
    return html

7. Generate a file named "<list name> <timestamp>.txt" that contains the hot searches of the selected list

# The output file is named "<first weibo element> <file-name timestamp>.txt"
fileName = f"{weibo[0]} {formatted_time_1}.txt"
with open(fileName, 'w+', encoding='utf-8') as file:
    # One line per weibo element; each element's parts are written back to back
    for data1 in weibo:
        file.writelines(data1)
        file.write('\n')
    # Check whether the file now exists and report the result
    if not os.path.isfile(fileName):
        print(Fore.MAGENTA + "文件保存失败,请重新运行!")
    else:
        filePath = os.path.abspath(fileName)
        print(Fore.LIGHTYELLOW_EX + "文件已保存,路径为:%s" % filePath)

print(Style.RESET_ALL, end='')  # Reset all styles

8. Page display

1. Main menu

2. List (take the game list as an example)

Guess you like

Origin blog.csdn.net/m0_62925086/article/details/131373241