Use Jsoup package to grab Douban Top250 movie information

Jsoup is a Java HTML-parsing library mainly used for writing crawler programs; its Python counterpart is Beautiful Soup. Jsoup builds a DOM model of a web page, much as JavaScript and jQuery do in the browser, which makes it a powerful tool for parsing web documents.

Using Jsoup feels much like using JavaScript selectors. Open the Douban Top 250 page:


Find the link address:

String[] url=new String[25];
		//进入Top250的界面,抓取单个电影的网址
		Document document=Jsoup.connect("https://movie.douban.com/top250?start="+j*25).get();
        Elements bd=document.select("div.hd");
        for(int i=0;i<bd.size();i++)
        {
    
    
        	Elements info=bd.get(i).select("a[href]");
        	url[i]=info.attr("href");
        }
        return url;

Next we need a function that, given a movie's URL, extracts all the information from that page.
If the text is inside a tag, select that tag and call its text() method to extract it. If it is not — for example, text that appears just before a <br> — such content is a text node: select a neighboring element node and then use methods such as nextSibling() to reach the text node. Note the following:
1. Document: document object model (dom)
2. Elements: collection of element objects
3. Node: node objects
These three go from largest to smallest. Ordinary Node objects (other than text nodes) have no text() method; use the outerHtml() method to get their value instead.

//抓取电影名
		String name=document.select("span[property=v:itemreviewed]").text();
		info[0]=name;
		//抓取导演名
		String director=document.select("a[rel='v:directedBy']").text();
		info[1]=director;
		//抓取编剧
		String pl=document.select("span.attrs").get(1).text();
		info[2]=pl;
		//抓取演员名
		String actor=document.select("span.attrs").get(2).text();
		info[3]=actor;
		//抓取电影类型
		String type=document.select("span[property='v:genre']").text();
		info[4]=type;
		//对是否含有官方网站进行讨论
		int i=4;
		if(document.select("span.pl").get(i).text().contains("官方网站"))
		{
    
    
			i++;
		}
		//抓取产地国家,它的内容是国家标签的下一个节点
		String country=document.select("span.pl").get(i).nextSibling().outerHtml();
		info[5]=country;
		//抓取语言
		String lan=document.select("span.pl").get(i+1).nextSibling().outerHtml();
		info[6]=lan;
		//抓取时长
		String runTime=document.select("span[property='v:runtime']").text();
		info[7]=runTime;
		//抓取别名
		String otherName=document.select("span.pl").get(i+4).nextSibling().outerHtml();
		info[8]=otherName;
		//抓取评价人数
		String peoNum=document.select("span[property='v:votes']").text();
		info[9]=peoNum;
		//抓取介绍
		String intro=document.getElementById("link-report").text();
		info[10]=intro;

Here is the process of capturing Douban movie information.




After capturing the information, we write it into a spreadsheet. Here we use the Apache POI package, which can read and write Office documents such as .docx and .xlsx files.

private static void writeRow(int i ,String[] row,File file) throws InvalidFormatException, IOException
	{
    
    
		Workbook workBook=new XSSFWorkbook(new FileInputStream(file));
		//获得Top250Sheet,前提是有这个sheet,没有会报错
		Sheet sheet=workBook.getSheet("Top250");
		Row r=sheet.createRow(i);
		for(int k=0;k<row.length;k++)
		{
    
    
			r.createCell(k).setCellValue(row[k]);
		}
		//将数据写进表格
		workBook.write(new FileOutputStream(file));
		
	}

By iterating over each list page you can collect the URLs of all the movies. The complete code is as follows:

package spider;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Crawls the Douban Top 250 movie list with Jsoup and writes one row per
 * movie into the "Top250" sheet of an existing workbook file Top250.xlsx
 * using Apache POI.
 */
public class DoubanTop250 
{
	public static void main(String[] args) throws InterruptedException, IOException, InvalidFormatException 
	{
		// Top250.xlsx must already exist and contain a sheet named "Top250".
		File file=new File("Top250.xlsx");
		// Header row for the spreadsheet.
		String[] title= {"电影名","导演","编剧","演员","类型","国家","语言","时长","别名","评价人数","简介"};
		writeRow(0,title,file);
		// The Top 250 is split across 10 list pages of 25 movies each.
		for(int k=0;k<10;k++)
		{
			// Detail-page URLs of the 25 movies on list page k.
			String[] url=getMovieUrl(k);
			for(int i=0;i<url.length;i++)
			{
				if(url[i]==null)
				{
					// Fewer entries than expected on this page; skip the gap.
					continue;
				}
				String[] list=getInfos(url[i]);
				// Throttle requests so Douban does not ban this IP.
				Thread.sleep(500);
				// getInfos returns null when the page could not be downloaded;
				// the original code passed that null into writeRow and crashed.
				if(list!=null)
				{
					writeRow(i+1+k*25,list,file);
				}
			}
		}
	}

	/**
	 * Fetches list page {@code j} (zero-based) of the Top 250 and returns the
	 * detail-page URLs of the movies on it.
	 *
	 * @param j list-page index, 0..9; page j starts at offset j*25
	 * @return array of up to 25 URLs; trailing entries are null if fewer were found
	 * @throws IOException if the list page cannot be downloaded
	 */
	private static String[] getMovieUrl(int j) throws IOException
	{
		String[] url=new String[25];
		// A user-agent header is required or Douban rejects the request.
		Document document=Jsoup.connect("https://movie.douban.com/top250?start="+j*25).header("user-agent", "Chrome").get();
		Elements bd=document.select("div.hd");
		for(int i=0;i<bd.size()&&i<url.length;i++)
		{
			// The first <a href> inside each div.hd links to the movie's detail page.
			url[i]=bd.get(i).select("a[href]").attr("href");
		}
		return url;
	}

	/**
	 * Downloads one movie's detail page and extracts its fields.
	 *
	 * @param url detail-page URL of the movie
	 * @return 11-element array: name, director, writers, cast, genres, country,
	 *         language, runtime, aliases, vote count, synopsis (entries may be
	 *         null when a field is absent); or null if the download failed
	 */
	private static String[] getInfos(String url) 
	{
		// Douban detail pages vary a lot, so several layouts are handled below.
		try
		{
			Document document=Jsoup.connect(url).get();
			String[] info=new String[11];
			info[0]=document.select("span[property=v:itemreviewed]").text();
			info[1]=document.select("a[rel='v:directedBy']").text();
			// Writers and cast both live in span.attrs; some pages omit one or both.
			Elements attrs=document.select("span.attrs");
			if(attrs.size()>2)
			{
				info[2]=attrs.get(1).text();
				info[3]=attrs.get(2).text();
			}
			else if(attrs.size()==2)
			{
				info[2]=null;
				info[3]=attrs.get(1).text();
			}
			else
			{
				info[2]=null;
				info[3]=null;
			}
			info[4]=document.select("span[property='v:genre']").text();
			// Pages that list an official website shift the later span.pl labels by one.
			Elements labels=document.select("span.pl");
			int i=4;
			if(i<labels.size()&&labels.get(i).text().contains("官方网站"))
			{
				i++;
			}
			// Country, language and aliases are text nodes following their label span.
			info[5]=siblingText(labels,i);
			info[6]=siblingText(labels,i+1);
			info[7]=document.select("span[property='v:runtime']").text();
			info[8]=siblingText(labels,i+4);
			info[9]=document.select("span[property='v:votes']").text();
			// select() yields "" rather than an NPE when #link-report is missing.
			info[10]=document.select("#link-report").text();
			return info;
		}
		catch(IOException e)
		{
			// Deliberately swallowed: one bad page must not abort the whole crawl.
			// The caller treats null as "skip this movie".
			return null;
		}
	}

	/**
	 * Returns the HTML of the text node immediately after {@code labels[index]},
	 * or null when the label or its sibling is absent (layout variations).
	 */
	private static String siblingText(Elements labels,int index)
	{
		if(index>=labels.size()||labels.get(index).nextSibling()==null)
		{
			return null;
		}
		return labels.get(index).nextSibling().outerHtml();
	}

	/**
	 * Writes one row of values into row index {@code i} of the "Top250" sheet,
	 * then saves the workbook back to the same file.
	 *
	 * @param i    zero-based row index to (re)create
	 * @param row  cell values, written left to right starting at column 0
	 * @param file an existing .xlsx workbook that already contains a sheet named "Top250"
	 * @throws IOException if the workbook cannot be read or written
	 */
	private static void writeRow(int i ,String[] row,File file) throws InvalidFormatException, IOException
	{
		// try-with-resources closes the input stream and the workbook even on error
		// (the original code leaked both streams and the workbook).
		try(FileInputStream in=new FileInputStream(file);
				Workbook workBook=new XSSFWorkbook(in))
		{
			// The sheet named "Top250" must already exist; getSheet returns null otherwise.
			Sheet sheet=workBook.getSheet("Top250");
			Row r=sheet.createRow(i);
			for(int k=0;k<row.length;k++)
			{
				r.createCell(k).setCellValue(row[k]);
			}
			// Write the updated workbook back to the same file.
			try(FileOutputStream out=new FileOutputStream(file))
			{
				workBook.write(out);
			}
		}
	}
}

Effect picture

Guess you like

Origin blog.csdn.net/m0_47202518/article/details/108549784