# Build the idiom (chengyu) database.
# -*- coding: UTF-8 -*-
# tanj
# 2018-01-15
#filename grab_idioms.py

'''
抓取成语字典
http://cy.5156edu.com/cymore.html 在线成语词典
'''
import ConfigParser
import urllib2

import sys

import MySQLdb
from bs4 import BeautifulSoup

# Cached ConfigParser instance shared by getConfig(); None until first use
# (get_con() also resets it to None to force a re-read).
config = None


def getConfig():
    """Return the parsed ``config.ini``, loading it on first use.

    The parser is cached in the module-level ``config`` variable. Every
    call returns the parser, including calls made after the cache has
    already been filled.

    Returns:
        The ``ConfigParser.ConfigParser`` instance holding the settings read
        from ``config.ini`` in the current working directory.
    """
    global config
    if config is None:
        config = ConfigParser.ConfigParser()
        # read() skips files it cannot open, so a missing config.ini only
        # shows up later as a missing section/option error.
        config.read("config.ini")
    # Return outside the ``if`` so a cached parser is handed back as well.
    return config


# Open a MySQL connection from the [localdb] section of the configuration.
def get_con():
    """Create and return a new MySQLdb connection.

    Reads ``host``, ``port``, ``user``, ``password``, ``database`` and
    ``charset`` from the ``localdb`` section of the parser returned by
    ``getConfig()``. The module-level ``config`` cache is reset to ``None``
    before connecting, so the next ``getConfig()`` call loads the file again.

    Returns:
        The connection object returned by ``MySQLdb.connect``.
    """
    global config
    config = getConfig()

    # Read the options in a fixed order; a missing option raises here,
    # before the cache is cleared.
    settings = {}
    for option in ('host', 'port', 'user', 'password', 'database', 'charset'):
        settings[option] = config.get('localdb', option)

    config = None

    return MySQLdb.connect(
        host=settings['host'],
        port=int(settings['port']),  # the option is read as text
        user=settings['user'],
        passwd=settings['password'],
        db=settings['database'],
        charset=settings['charset'],
    )


def executelist(sql_list):
    """Execute every SQL statement in ``sql_list`` and commit once at the end.

    All statements run on a single connection obtained from ``get_con()``.
    A statement that fails is reported on stdout and skipped; the remaining
    statements still run and whatever succeeded is committed.

    Args:
        sql_list: Sequence of complete SQL statement strings. An empty (or
            ``None``) value is a no-op and no connection is opened.

    Returns:
        None.
    """
    if not sql_list:
        return

    conn = get_con()
    try:
        cursor = conn.cursor()
        try:
            for sql in sql_list:
                try:
                    print(sql)
                    cursor.execute(sql)
                except Exception as e:
                    # Best effort: report the failing statement and carry on.
                    print("mysql query error: %s" % (e,))
        finally:
            cursor.close()
        conn.commit()
    finally:
        # Always release the connection, even if commit or cursor handling
        # raises.
        conn.close()

# Index letters used in the page URLs (<letter>.html, <letter>_2.html, ...).
pinyins = list("abcdefghijklmnopqrstuvwxyz")


def _decode_page(raw_html):
    """Decode a downloaded page to text; return None if no codec fits.

    gb2312 is tried before gbk so that, when both succeed, the gb2312
    result is kept, as in the earlier two-step conversion.
    """
    for encoding in ("gb2312", "gbk"):
        try:
            return raw_html.decode(encoding)
        except UnicodeError as e:
            print(e)
    return None


def _build_insert_sql(idiom, pinyin_index, page_label):
    """Build the INSERT statement for one idiom row.

    NOTE(review): the statement is assembled by string concatenation from
    text scraped off a web page, so a quote or backslash in ``idiom`` would
    break (or alter) the statement. executelist() only accepts plain SQL
    strings, so switching to parameterized queries needs a change there too.
    """
    return ("insert into idioms_dic(chengyu,pinyin_index,page) values('"
            + idiom + "','" + pinyin_index + "','" + page_label + "')")


# Walk every index letter and every page of that letter, storing the idioms
# found on each page. A letter is finished when fetching its next page fails.
for pinyin_index in pinyins:
    print(pinyin_index)
    NUM = 1
    while True:
        # The first page has no suffix; later pages are "<letter>_<n>.html".
        page = "" if NUM == 1 else "_" + str(NUM)
        url = "http://cy.5156edu.com/html2/" + pinyin_index + page + ".html"
        # Value stored in the `page` column: "0" for the first page,
        # otherwise the page number as text.
        page_label = "0" if NUM == 1 else str(NUM)
        NUM = NUM + 1

        print(url)
        try:
            response = urllib2.urlopen(url)
            try:
                html = response.read()
            finally:
                response.close()
        except Exception as e:
            # Any fetch failure (e.g. the page does not exist) ends this
            # letter and moves on to the next one.
            print(e)
            break

        details = _decode_page(html)
        if details is None:
            # Skip undecodable pages so text left over from an earlier page
            # is never parsed and inserted a second time.
            continue

        # Pass decoded text so the parser does not have to guess the encoding.
        soup = BeautifulSoup(details)
        tables = soup.select('table')
        if not tables:
            print("no table found in " + url)
            break

        sql_list = []
        for row in tables[0].select('u'):
            text = row.string
            if text is None:
                # .string is None when the tag holds more than one child;
                # there is no single idiom text to store.
                continue
            print(text)
            sql = _build_insert_sql(text.strip(), pinyin_index, page_label)
            print(sql)
            sql_list.append(sql)

        try:
            executelist(sql_list)
        except Exception as e:
            print(e)
# 猜你喜欢