python-pymysql-requests-beautifulsoup: scrape data and save it to MySQL --- java-mysql-spring_boot-jpa backend reads the data

Copyright notice: this is the author's original article and may not be reposted without permission. https://blog.csdn.net/qq_34272093/article/details/82703275

Python crawler

# Warm-up: basic pymysql usage - create a table and insert a row.
# import urllib.request
# import http.cookiejar
# import pymysql
# conn = pymysql.connect(host="localhost", user="root", password="123456", database="test")
# cursor = conn.cursor()
# cursor.execute("DROP TABLE IF EXISTS employee")
# sql = """CREATE TABLE employee(first_name CHAR(20) NOT NULL,
#          last_name CHAR(20),
#          age INT,
#          sex CHAR(1))"""
# cursor.execute(sql)
# sqlInsert = """INSERT INTO employee(first_name,last_name,age,sex) VALUES('李白','白居易',20,'男')"""
# try:
#     cursor.execute(sqlInsert)
#     cursor.execute(sqlInsert)
#     conn.commit()
# except:
#     conn.rollback()
# conn.close()


# Crawler: scrape pages and write each item into the user_data table
# import requests
# from bs4 import BeautifulSoup
# import pymysql
#
# # Local database host
# sql_host = 'localhost'
# # Database username
# sql_user = 'root'
# # Database password
# sql_password = '123456'
# # Database name
# sql_name = 'test'
# SQL_INSERT = """INSERT INTO user_data(author,page,sex,age,vote,content) VALUES(%s,%s,%s,%s,%s,%s)"""
#
# def download_page(http_url):
#     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
#     call_back = requests.get(http_url, headers=headers)
#     return call_back.text
#
#
# def get_page_content(html, page):
#     conn = pymysql.connect(host=sql_host, user=sql_user, password=sql_password, database=sql_name)
#     cursor = conn.cursor()
#     soup = BeautifulSoup(html, 'html.parser')
#     con = soup.find(id='content-left')
#     con_list = con.find_all('div', class_='article')
#     for item in con_list:
#         author = item.find('h2').string
#         content = item.find('div', class_='content').find('span').get_text()
#         stats = item.find('div', class_='stats')
#         vote = stats.find('span', class_='stats-vote').find('i', class_='number').get_text()
#         comments = stats.find('span', class_='stats-comments').find('i', class_='number').string
#         author_info = item.find('div', class_='articleGender')
#         if author_info is not None:
#             class_list = author_info['class']
#             age = author_info.string
#             if 'womenIcon' in class_list:
#                 sex = '女'
#             elif 'manIcon' in class_list:
#                 sex = '男'
#             else:
#                 sex = ''
#         else:
#             sex = ''
#             age = ''
#         # cursor.execute(SQL_INSERT, ("name","data","gg","sd","dd"))
#         cursor.execute(SQL_INSERT,(author,page,sex,age,vote,content))
#         conn.commit()
#         # conn.close()
#
# def main():
#     conn = pymysql.connect(host=sql_host, user=sql_user, password=sql_password, database=sql_name)
#     cursor = conn.cursor()
#     cursor.execute("""DELETE FROM user_data""")
#     conn.commit()
#     conn.close()
#     for i in range(1, 14):
#         http_url = 'https://qiushibaike.com/text/page/{}'.format(i)
#         html = download_page(http_url)
#         get_page_content(html,i)
#
#
# if __name__ == '__main__':
#     main()
import requests
from bs4 import BeautifulSoup
import pymysql

# Target markup looks like <div class="articleGender manIcon">20</div>; requests are sent with a Firefox 62 User-Agent.
def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:62.0) Gecko/20100101 Firefox/62.0'}
    call_back = requests.get(url, headers=headers)
    return call_back.text


def get_page_content(html, page):
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='content-left')
    content_list = content.find_all('div', class_='article')
    for list_item in content_list:
        author_div = list_item.find('div', class_='author')
        author = author_div.find('h2').string
        author_info = author_div.find('div', class_='articleGender')
        if author_info is not None:
            info_list = author_info['class']
            age = author_info.string
            if 'manIcon' in info_list:
                sex = '男'
            elif 'womenIcon' in info_list:
                sex = '女'
            else:
                sex = ''
        else:
            age = ''
            sex = ''
        print(author, sex, age)

if __name__ == '__main__':
    # The parser expects the qiushibaike page structure (id='content-left'),
    # so point it at the pages used in the commented-out version above.
    url = 'https://qiushibaike.com/text/page/1'
    html = get_html(url)
    get_page_content(html, 1)
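
The commented-out crawler inserts into a user_data table that the article never defines. Below is a minimal sketch of a matching table, modeled on the employee warm-up at the top; the column types and the auto-increment id are assumptions, not the author's schema:

import pymysql

# Sketch of the user_data table used by SQL_INSERT above.
# Column names come from the INSERT statement; the types and the
# auto-increment id are assumptions.
conn = pymysql.connect(host='localhost', user='root', password='123456', database='test')
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS user_data")
cursor.execute("""CREATE TABLE user_data(
                  id INT PRIMARY KEY AUTO_INCREMENT,
                  author VARCHAR(100),
                  page VARCHAR(10),
                  sex VARCHAR(10),
                  age VARCHAR(10),
                  vote VARCHAR(20),
                  content TEXT)""")
conn.close()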

java-spring_boot-mysql

Entity class

@Entity
public class UserData {

  // Maps to the user_data table filled by the Python crawler
  // (Spring Boot's default naming strategy turns UserData into user_data).
  @Id
  private int id;
  private String author;
  private String page;
  private String sex;
  private String age;
  private String vote;
  private String content;

  // Getters and setters omitted; Jackson needs the getters to serialize
  // the entity in the REST response below.
}

@Repository
public interface User extends JpaRepository<UserData, Integer> { }

REST controller

@RestController
@RequestMapping(value = "/name")
public class Data {

    @Autowired
    private User user;

    @RequestMapping(value = "/data")
    public List<UserData> name() {
        return user.findAll();
    }

}
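
With the Spring Boot application running, the endpoint can be checked from Python as well. This is only a quick illustration: the host and port (localhost:8080, the Spring Boot default) and the JSON field names (derived from the entity's getters) are assumptions.

import requests

# Quick check of the /name/data endpoint defined above.
# Assumes the application runs locally on Spring Boot's default port 8080.
response = requests.get('http://localhost:8080/name/data')
for row in response.json():
    print(row['author'], row['sex'], row['age'], row['vote'])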
