Crawling Maoyan ("Cat's Eye") released-movie information with Python, requests, and re (regular-expression matching)

Use Python with requests to fetch the Maoyan ("Cat's Eye") page of released movies, then use re regular-expression matching to extract each movie's rank, picture address, title, starring cast, release time, and score.

 

import requests
import re, json


def get_html(url, timeout=10):
    """
    Fetch a page and return its HTML source as text.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up;
        requests waits indefinitely when no timeout is supplied, so a
        stalled connection would otherwise hang the script
    :return: the response body decoded to ``str`` (``Response.text``)
    :raises requests.exceptions.RequestException: on connection failure
        or when the timeout expires
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    # Send a browser-like User-Agent header with the request.
    headers = {
        "User-Agent": user_agent
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): the HTTP status code is not checked, so an error page is
    # returned as ordinary HTML -- confirm whether callers rely on that.
    return r.text


def parse_one_page(html):
    """
    Extract movie entries from page source with a regular expression.

    Every ``<dd>...</dd>`` element that matches the pattern produces one
    dict; input with no matching element produces nothing.

    :param html: page source as a string
    :return: generator of dicts with the keys ``rank``, ``Image Address``,
        ``title``, ``starring``, ``release time`` and ``score``
    """
    # Capture groups, in order: rank, picture address, title, starring,
    # release time, integer part of the score, fractional part of the score.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # let "." cross the line breaks inside each <dd> element
    )

    for rank, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            "rank": rank,
            "Image Address": image,
            "title": title,
            # The slices drop a fixed-length label in front of the value.
            # NOTE(review): 3 and 4 are assumed to equal the label lengths
            # on the live page -- confirm against real markup.
            "starring": star.strip()[3:],
            "release time": release.strip()[4:],
            # The score is split across two <i> elements; join the halves.
            "score": integer + fraction,
        }


# Data storage

def write_file(content):
    """
    Append one record to ``result.txt`` as a single line of JSON.

    :param content: JSON-serialisable object to store
    :return: None
    """
    with open("result.txt", mode='a+', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text readable in the file
        # instead of writing \uXXXX escapes.
        record = json.dumps(content, ensure_ascii=False)
        out.write(record + "\n")


def main():
    """
    Script entry point: download the board page, then print and store
    every movie entry parsed from it.

    :return: None
    """
    page = get_html("http://maoyan.com/board/4")
    movies = parse_one_page(page)
    for movie in movies:
        print(movie)
        write_file(movie)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

Guess you like

Origin www.cnblogs.com/CesareZhang/p/11027772.html