Java使用xpath获取58同城数据

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_35971258/article/details/86654299

Java使用xpath获取58同城数据

package common;
/**
 * Fetches a rental-listing page over HTTP and extracts its title, description and
 * image URLs with XPath (JsoupXpath).
 * sunwengang   2017-08-13  20:00
 */
import cn.wanghaomiao.xpath.model.JXDocument;
import com.jimi.house.common.utils.CheckString;
import com.jimi.house.modules.apartment.entity.ShareHouse;
import org.apache.log4j.Logger;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Demo that downloads a single rental-listing page and extracts a few fields
 * from it with JsoupXpath: the title, the description, up to
 * {@link #MAX_IMAGES} image URLs, and the lease mode derived from the title.
 *
 * <p>Everything is written to the log; nothing is persisted.
 */
public class URLDemo {

    private static final Logger logger = Logger.getLogger(URLDemo.class);

    /** Maximum number of image URLs kept per listing (indexes 0..5 of the match list). */
    private static final int MAX_IMAGES = 6;

    /** Connect and read timeout in milliseconds, so a stalled server cannot hang the demo. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the hard-coded listing URL, parses it and logs the extracted fields.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Address of the listing page to scrape.
        String strurl="https://3g.ganji.com/sz_zufang/36880262146970x.shtml?gjcity=sz&cookie=|||4160337230276824358417&apptype=12&fzbref=0&key=&pubid=58168182&params=rankjxzfbestm2099^desc&trackkey=36880262146970_86156089-e0f5-4d83-b2f1-56ff15f0f641_20190126092658_1548466018254&fcinfotype=gz&jingxuan=1";

        try {
            String html = fetchHtml(strurl);
            logger.info("【解析源url】:" + strurl);
            logger.info("【日志信息】" + html);

            // Wrap the HTML so XPath expressions can be evaluated against it.
            // Example of a more complex expression supported by the library:
            // //div[@id='post_list']/div[./div/div/span[@class='article_view']/a/num()>1000]/div/h3/allText()
            JXDocument jxDocument = new JXDocument(html);

            // Pages served from m.58.com are skipped by this demo.
            if (!strurl.contains("m.58.com")) {

                // Title: full text of the header's left column, with the "小区:" label removed.
                String title = "";
                String titleXpath = "//div[@class='house-header cont-padding']/div[@class='house-header-left']/allText()";
                List<Object> titleNodes = jxDocument.sel(titleXpath);
                logger.info("【原始标题】:" + titleNodes);
                if (CheckString.isNotEmpty(titleNodes)) {
                    title = ((String) titleNodes.get(0)).replace("小区:", "");
                    logger.info("【处理之后的title】:" + title);
                }

                // Description: text of the first matching paragraph, minus the site's boilerplate sentence.
                String remark = "";
                String remarkXpath = "//div[@class='configure']/p/allText()";
                List<Object> remarkNodes = jxDocument.sel(remarkXpath);
                logger.info("【原始描述】:" + remarkNodes);
                if (CheckString.isNotEmpty(remarkNodes)) {
                    remark = ((String) remarkNodes.get(0)).replace("联系我时,请说是在58同城上看到的,谢谢", "");
                    logger.info("【处理之后的remark】:" + remark);
                }

                // Images: keep at most MAX_IMAGES "src" attributes from the slider.
                String imageXpath = "//div[@class='swiper-slide']/img/@src";
                List<Object> imageNodes = jxDocument.sel(imageXpath);
                // Guard the size lookup: the emptiness check below is what protects against null.
                int rawImageCount = imageNodes == null ? 0 : imageNodes.size();
                logger.info("【原始图片】:" + rawImageCount + ":具体值:" + imageNodes);
                List<String> imgUrls = new ArrayList<>();
                if (CheckString.isNotEmpty(imageNodes)) {
                    int limit = Math.min(rawImageCount, MAX_IMAGES);
                    for (int i = 0; i < limit; i++) {
                        imgUrls.add((String) imageNodes.get(i));
                    }
                    logger.info("【imgUrls的长度是】:" + imgUrls.size());
                }

                // Lease mode: first a fixed sample input for reference, then the real title.
                String leaseMode = "";
                logger.info("【房子模式】:" + CheckString.getLaseMode("村"));
                if (CheckString.isNotEmpty(title)) {
                    leaseMode = CheckString.getLaseMode(title);
                    logger.info("【处理之后的leaseMode】:" + leaseMode);
                }
            }
        } catch (Exception e) {
            // Top-level boundary of the demo: report the failure with its cause through the logger.
            logger.error("Failed to fetch or parse listing page: " + strurl, e);
        }
    }

    /**
     * Downloads the page at {@code pageUrl} and returns its body decoded as UTF-8.
     *
     * <p>Line breaks are kept as {@code '\n'} so that markup split across lines
     * is not fused together. The reader is closed and the connection is
     * disconnected even when reading fails.
     *
     * @param pageUrl absolute HTTP(S) URL of the page to download
     * @return the response body as text
     * @throws IOException if the URL is malformed, the connection fails or times out
     */
    private static String fetchHtml(String pageUrl) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.6; Windows NT)");
            conn.setDoInput(true);
            conn.setInstanceFollowRedirects(true);

            StringBuilder builder = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    builder.append(line).append('\n');
                }
            }
            return builder.toString();
        } finally {
            conn.disconnect();
        }
    }
}

☛注意:使用 allText() 可以获取所选节点下(包括其所有子节点)的全部纯文本

String xpath = "//div[@class='house-header cont-padding']/div[@class='house-header-left']/allText()";

猜你喜欢

转载自blog.csdn.net/qq_35971258/article/details/86654299