Chinadaily bilingual news crawling

Today I needed to crawl some bilingual corpus material on short notice (it has not been cleaned yet), and I wanted to make full use of what the site provides.

The code below collects the link of each bilingual news article from the Chinadaily pages. First, study the URL and page structure of these pages, including pagination: later pages are usually the homepage URL plus _2, _3, and so on. The following script therefore only gathers the links.
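For example, the listing pages are expected to look like this (the exact form of the later pages is an assumption based on that pattern):

http://language.chinadaily.com.cn/columnist/columnist.html      (page 1)
http://language.chinadaily.com.cn/columnist/columnist_2.html    (page 2)
http://language.chinadaily.com.cn/columnist/columnist_3.html    (page 3)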

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: bi_news.py
Author: ZhangHaiou([email protected])
Date: 2018/05/04
"""

import urllib
import re
import os

bi_urls = []
def getHtml(url):    # fetch the page and return its lines
    page = urllib.urlopen(url)
    html = page.readlines()
    #print html
    return html

def getImg(html):    # unused helper kept from an image-downloading example
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl,'%s.jpg' % x)
        x+=1
    
def geturl(html):    # extract the bilingual-news links from the listing page
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):  # only the corpus after 2016 is wanted, so stop at the first 2016 article
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print("http://language.chinadaily.com.cn/" + url[0])
                bi_urls.append("http://language.chinadaily.com.cn/" + url[0])

                
if __name__ == '__main__':
    n = 1
    # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
    # geturl(getHtml("http://language.chinadaily.com.cn/news_bilingual.html"))
    while n:  # page through the listing; geturl() calls os._exit(0) once 2016 articles appear
        if n < 2:
            html = getHtml("http://language.chinadaily.com.cn/columnist/columnist.html")
        else:
            html = getHtml("http://language.chinadaily.com.cn/columnist/columnist_" + str(n) + ".html")  # assumed pattern: page n is columnist_n.html
        geturl(html)
        n = n + 1
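
Note that both scripts use the Python 2 urllib API (urllib.urlopen, urllib.urlretrieve). On Python 3 the fetch helper would go through urllib.request instead; a minimal sketch:

import urllib.request

def getHtml(url):  # Python 3 equivalent: fetch the page and return its decoded lines
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8', errors='ignore').splitlines()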

 

Run python bi_news.py > url.txt to save the collected URLs.

url.txt content:
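Each line is one article URL, shaped like this (the article ids below are made up for illustration):

http://language.chinadaily.com.cn/2017-05/04/content_29193456.htm
http://language.chinadaily.com.cn/2017-04/28/content_29118765.htm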

 

The next step is simply to crawl the page content behind each URL in url.txt and organize the news into folders by month. The file name is taken from the run of digits at the end of each news link (the article id).
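For a (hypothetical) link like the one below, the month directory and the file name fall out of two small regexes, the same ones the script uses:

>>> import re
>>> url = "http://language.chinadaily.com.cn/2017-05/04/content_29193456.htm"
>>> re.findall(r'\d\d\d\d-\d\d', url)[0]    # month directory
'2017-05'
>>> re.findall(r'\d{6,}', url)[0]           # article id used as the file name
'29193456'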

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: content.py
Author: ZhangHaiou([email protected])
Date: 2018/05/04
"""

import urllib
import re
import os
import sys
bi_urls = []
def getHtml(url):  # fetch the page as a single string
    page = urllib.urlopen(url)
    html = page.read()
    #print html
    return html

def getImg(html):  # unused helper kept from an image-downloading example
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl,'%s.jpg' % x)
        x+=1
    
def geturl(html):  # unused here; copied from bi_news.py (it expects a list of lines, while this script's getHtml() returns one string)
    for line in html:
        if re.search('\<div class="mr10"\>\<a href="\d\d\d\d\-\d\d/\d\d/content\_\d{4,}.htm"',line):
            if re.search('\<div class="mr10"\>\<a href="2016\-\d\d/\d\d/content\_\d{4,}.htm"',line):                
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d\-\d\d/\d\d/content\_\d{4,}.htm',line)
                print(url)
                bi_urls.append(url)
def savefile(savepath, content):
    with open(savepath, "w") as fp:
        fp.write(content)
                
if __name__ == '__main__':        

    for line in open(sys.argv[1], 'r'):
        content = ""
        base = line.strip()  # drop the trailing newline, otherwise urlopen gets a broken URL
        n = 1
        while n:  # page through the article so multi-page news is not missed
            if n > 1:
                htm = base[:-len(".htm")] + "_" + str(n) + ".htm"  # assumed pattern: page n lives at content_NNN_n.htm
            else:
                htm = base
            raw = getHtml(htm)

            if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):  # stop at the first blank/missing page
                break
            print(htm)
            n = n + 1
            # for hang in raw:
                # if re.search('^\<p\>.*\<\/p\>',hang):
            content = content + raw
        date = re.findall(r'\d\d\d\d-\d\d', base)[0]
        filename = re.findall(r'\d{6,}', base)[0]
        if not os.path.exists(date):  # create the month directory if it does not exist yet
            os.makedirs(date)
        savefile(date + "/" + filename + ".txt", content)
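
Run it with the URL list from the first step: python content.py url.txt. Each article is then saved as raw HTML under its YYYY-MM month directory, e.g. 2017-05/29193456.txt (the id here is hypothetical); cleaning the HTML is a separate step.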
        
      

 
