Java crawler code example

1. Add the pom dependencies:

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.12</version>
        </dependency>

        <!-- HTML parser -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
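
Note: the httpclient dependency is declared here, but the example in step 2 fetches pages with jsoup's own Connection, so httpclient is not strictly required by it. If you would rather download the HTML with HttpClient and use jsoup only for parsing, a minimal sketch could look like the following (the URL is just a placeholder):

    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    public class HttpClientFetchDemo {
        public static void main(String[] args) throws Exception {
            // download the page body with HttpClient, then hand the HTML string to jsoup for parsing
            try (CloseableHttpClient client = HttpClients.createDefault();
                 CloseableHttpResponse response = client.execute(new HttpGet("http://www.ofweek.com/"))) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                Document doc = Jsoup.parse(html);
                System.out.println(doc.title());
            }
        }
    }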

2. Code example:

package com.atguigu.gulimall.ware.controller;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Vector;

public class WordWrite {
    public static Vector<String> titles = new Vector<String>();      // holds the article titles
    public static Vector<String> t_contents = new Vector<String>();  // holds the article contents
    public static void getTitleAndUrl(String url) {
        try {
            Vector<String> t_urls=new Vector<String>();
            Document doc = Jsoup.connect(url).get();
            Elements links = doc.getElementsByClass("b12c");
            // Each article title sits inside a div whose class is zx-tl, so getElementsByClass can be used
            // directly; a selector would also work, e.g. select("div.zx-tl")
            Elements elements = links.select("p");
            for(Element element:elements){
                t_contents.add(element.text());
            }
            Elements urls = links.select("a");  // both the title and the link live inside the a tag
            for (Element link : urls){
                // read the link address from the a tag
                t_urls.add(link.attr("href"));  // add each article's link to t_urls
                titles.add(link.text());        // add each article's title to titles
            }
            for(String t_url:t_urls) {
                Document doc2=Jsoup.connect(t_url).get();
                // fetch the article body by its class name
                Elements contents=doc2.getElementsByClass("artical-content");
                t_contents.add(contents.text());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        try {
            /*Scanner input = new Scanner(System.in);
            System.out.println("Enter a keyword to search on OFweek:");
            String keyword = input.next();
            System.out.println("Enter how many result pages to fetch (e.g. 2 means the first 2 pages):");
            String pagenumber = input.next();
            String txt_name = "keyword " + keyword + " first " + pagenumber + " pages.txt";*/
            File file=new File("F:\\test1\\demo\\test2.txt");
           // int pagenum=Integer.parseInt(pagenumber);
           /* for(int i=1;i<=pagenum;i++) {
                String page=Integer.toString(i);
                String url="http://www.ofweek.com/newquery.action?keywords="+keyword+"&type=1&pagenum="+page;
            }*/
            // pass in the URL of the site you want to crawl
            getTitleAndUrl("http://www.ofweek.com/newquery.action?keywords=");
            if(!file.exists()){
                file.createNewFile();
            }
            FileWriter fileWriter = new FileWriter(file.getAbsoluteFile());
            BufferedWriter bw = new BufferedWriter(fileWriter);
            for (int i = 0; i < t_contents.size(); i++) {
                bw.write(String.valueOf(i));  // write the index as text; bw.write(int) would emit a single character code
                bw.write("\n");
                bw.write(t_contents.get(i));
                bw.write("\n\n");
            }
            bw.close();
            System.out.println("txt文件已经成功记录!");
        }
        catch(IOException e) {
            e.printStackTrace();
        }
    }
}
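
If you want to crawl several search-result pages for a keyword instead of a single hard-coded URL, the commented-out Scanner/pagination code above can be re-enabled; a minimal sketch of that loop (it needs import java.util.Scanner, and the query parameters simply follow the URL pattern already shown in the comments, so treat them as an assumption about the site) could be:

    Scanner input = new Scanner(System.in);
    System.out.println("Enter a keyword to search on OFweek:");
    String keyword = input.next();
    System.out.println("Enter how many result pages to fetch:");
    int pagenum = Integer.parseInt(input.next());
    for (int i = 1; i <= pagenum; i++) {
        // build the search URL for each result page and collect its titles and contents
        String url = "http://www.ofweek.com/newquery.action?keywords=" + keyword + "&type=1&pagenum=" + i;
        WordWrite.getTitleAndUrl(url);
    }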

Reposted from blog.csdn.net/miachen520/article/details/129428479