初学者学爬虫可以玩玩我做的这个小demo,原理就是用了jsoup这个库,数据源是房天下的数据,爬的是杭州的房价,总共爬了100页。
一共两个类,超简单的:一个是爬虫demo,另一个是简单的开发商类,里面存放了这个开发商有多少套房源、均价多少、总价多少,方便后面对所有开发商的均价做个排行。
/**
* Created by Precious_Life on 2018/8/14.
*/
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.TreeMap;
/**
 * Demo crawler: downloads a fixed range of listing pages with jsoup, prints every
 * listing it can parse, aggregates the listings by the name taken from each
 * listing's third link, and finally prints those names ranked by average unit price.
 *
 * <p>The crawl runs on a single worker thread started from {@link #main(String[])}.
 */
public class JsoupBuildingTest {
    /** Number of listings that were parsed and recorded so far. */
    public static int count = 0;

    /** Aggregated figures per name; a {@link TreeMap}, so keys are kept in natural order. */
    public static Map<String, kaifashangInfo> treemap = new TreeMap<String, kaifashangInfo>();

    /** Listing page URL without its trailing page number. */
    private static final String PAGE_URL_PREFIX = "http://esf.hz.fang.com/house-a0154/i3";

    /** Pages 1..PAGE_COUNT are fetched. */
    private static final int PAGE_COUNT = 100;

    /**
     * Per-request timeout in milliseconds. The previous value (999999999 ms, roughly
     * 11.5 days) meant a stalled connection would effectively hang the crawl forever.
     */
    private static final int REQUEST_TIMEOUT_MS = 30000;

    /** Increments {@link #count} under the class lock. */
    public static synchronized void countIncrease() {
        count++;
    }

    /**
     * Folds one listing into the aggregate stored under {@code name}.
     *
     * <p>The stored average is a count-weighted running mean, so every listing
     * contributes equally. (Averaging the old mean with the new value, as the code
     * used to do, gives the newest listing a weight of 1/2 no matter how many
     * listings came before it.) The result is truncated to an int on each update.
     *
     * @param name    aggregation key
     * @param zongjia total price of the listing, added to the running sum
     * @param junjia  unit price of the listing, folded into the running mean
     */
    public static synchronized void addkaifashang(String name, int zongjia, int junjia) {
        kaifashangInfo existing = treemap.get(name);
        if (existing == null) {
            treemap.put(name, new kaifashangInfo(name, 1, zongjia, junjia));
            return;
        }
        int previousCount = existing.getHousecount();
        int newCount = previousCount + 1;
        // Widen to long: mean * count and the price sum can both exceed the int range.
        long unitPriceSum = (long) existing.getJunjia() * previousCount + junjia;
        int newJunjia = (int) (unitPriceSum / newCount);
        // kaifashangInfo stores the total as an int, so saturate instead of wrapping negative.
        long totalPrice = (long) existing.getZongjia() + zongjia;
        int newZongjia = (int) Math.min((long) Integer.MAX_VALUE, totalPrice);
        treemap.put(name, new kaifashangInfo(name, newCount, newZongjia, newJunjia));
    }

    /**
     * Starts the crawl on a new thread and returns immediately.
     *
     * @param args unused
     * @throws IOException declared for compatibility; fetch errors are handled on the worker thread
     */
    public static void main(String[] args) throws IOException {
        Runnable runnable = new Runnable() {
            @Override
            public void run() {
                for (int page = 1; page <= PAGE_COUNT; page++) {
                    try {
                        crawlPage(page);
                    } catch (IOException e) {
                        // A single failed page must not throw away everything collected so far.
                        System.err.println("Failed to fetch page " + page);
                        e.printStackTrace();
                    }
                }
                printSummary();
            }
        };
        new Thread(runnable).start();
    }

    /** Fetches one listing page and records every listing found in its first "shop_list" element. */
    private static void crawlPage(int page) throws IOException {
        Document document = Jsoup.connect(PAGE_URL_PREFIX + page)
                .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                .timeout(REQUEST_TIMEOUT_MS)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate")
                .header("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3")
                .header("Connection", "keep-alive")
                .header("Host", "esf.hz.fang.com")
                // Do not reject the response based on its content type.
                .ignoreContentType(true)
                .get();
        Elements lists = document.getElementsByClass("shop_list");
        if (lists.isEmpty()) {
            // The page came back without the expected container; nothing to parse.
            System.err.println("No shop_list element on page " + page + ", skipping");
            return;
        }
        for (Element listing : lists.get(0).select("dl")) {
            recordListing(listing);
        }
    }

    /**
     * Parses one {@code dl} element, records it via {@link #addkaifashang} and prints it.
     * Elements that do not have the expected structure or price text are skipped
     * without being counted.
     */
    private static void recordListing(Element listing) {
        Elements parts = listing.children();
        if (parts.size() < 3) {
            return;
        }
        Element details = parts.get(1);
        Element prices = parts.get(2);
        Elements links = details.select("a");
        Elements priceSpans = prices.select("span");
        // The name is read from the third link and the unit price from the second span.
        if (links.size() < 3 || priceSpans.size() < 2) {
            return;
        }

        int junjia_final;
        int zongjia_final;
        try {
            junjia_final = (int) numberBefore(priceSpans.get(1).text(), "元");
            // Multiply before narrowing so the fractional part of the 万 figure is kept.
            zongjia_final = (int) Math.round(numberBefore(prices.getElementsByClass("red").text(), "万") * 10000);
        } catch (NumberFormatException e) {
            System.err.println("Skipping listing with unparseable price: " + e.getMessage());
            return;
        }

        String kaifashang = links.get(2).attr("title");
        String address = details.getElementsByClass("add_shop").select("span").text();
        String leixing = details.getElementsByClass("tel_shop").text();

        countIncrease();
        addkaifashang(kaifashang, zongjia_final, junjia_final);
        System.out.println("Num"+count+" 开发商" + kaifashang + " 地址:" + address + " 户型:" + leixing + " 单价:" + junjia_final + "元/每平 总价:" + zongjia_final+"元");
    }

    /**
     * Returns the number that precedes the first occurrence of {@code unit} in {@code text}.
     *
     * @throws NumberFormatException if {@code unit} is absent or the prefix is not a number
     */
    private static double numberBefore(String text, String unit) {
        int end = text.indexOf(unit);
        if (end < 0) {
            throw new NumberFormatException("missing \"" + unit + "\" in \"" + text + "\"");
        }
        return Double.parseDouble(text.substring(0, end));
    }

    /** Prints the listing total followed by all aggregated names in ascending order of average unit price. */
    private static void printSummary() {
        System.out.println("总共" + count + "套房源!");
        System.out.println("==========================================");
        System.out.println("==========================================");
        System.out.println("==========================================");
        List<Map.Entry<String, kaifashangInfo>> ranking =
                new ArrayList<Map.Entry<String, kaifashangInfo>>(treemap.entrySet());
        Collections.sort(ranking, new Comparator<Map.Entry<String, kaifashangInfo>>() {
            // Ascending by average unit price.
            @Override
            public int compare(Map.Entry<String, kaifashangInfo> o1, Map.Entry<String, kaifashangInfo> o2) {
                return Integer.valueOf(o1.getValue().getJunjia()).compareTo(o2.getValue().getJunjia());
            }
        });
        System.out.println("下面是滨江区的开放商房价排行");
        int houseCount = 0;
        for (Map.Entry<String, kaifashangInfo> e : ranking) {
            houseCount += e.getValue().getHousecount();
            System.out.println(e.getKey()+" 均价:"+e.getValue().getJunjia()+"元/平"+ " 房源共"+e.getValue().getHousecount()+"套");
        }
        System.out.println("滨江现有房源"+houseCount+"套");
    }
}
下面就是开发商的类了,我都是用拼音进行命名的,应该很容易看懂吧。
/**
* Created by Precious_Life on 2018/8/15.
*/
/**
 * Mutable value holder for the figures kept under one name: the name itself,
 * a listing count, a total price and an average (unit) price.
 */
public class kaifashangInfo {
    /** Name these figures belong to. */
    public String name;
    /** Number of listings. */
    int housecount;
    /** Total price. */
    int zongjia;
    /** Average price. */
    int junjia;

    /**
     * Creates a holder with all four values set.
     *
     * @param name       name the figures belong to
     * @param housecount number of listings
     * @param zongjia    total price
     * @param junjia     average price
     */
    public kaifashangInfo(String name, int housecount, int zongjia, int junjia) {
        this.name = name;
        this.housecount = housecount;
        this.zongjia = zongjia;
        this.junjia = junjia;
    }

    /** @return the name */
    public String getName() {
        return this.name;
    }

    /** @return the number of listings */
    public int getHousecount() {
        return this.housecount;
    }

    /** @return the total price */
    public int getZongjia() {
        return this.zongjia;
    }

    /** @return the average price */
    public int getJunjia() {
        return this.junjia;
    }

    /** @param name the new name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param housecount the new number of listings */
    public void setHousecount(int housecount) {
        this.housecount = housecount;
    }

    /** @param zongjia the new total price */
    public void setZongjia(int zongjia) {
        this.zongjia = zongjia;
    }

    /** @param junjia the new average price */
    public void setJunjia(int junjia) {
        this.junjia = junjia;
    }
}
对于上面的结果,就是程序运行时实时输出了每条房源的信息,然后又对房源数据做了下分析,按均价做了个排行。
截下图~~
这个图是各开发商的房源均价排行
把这个数据爬下来之后,有啥用么??没啥用。。。。。哈哈哈,查了下房价最低的这个聆涛苑到滨江这边的距离
要两个小时呢!!!!!!!!!!!
扫描二维码关注公众号,回复:
2828905 查看本文章
!!!!!!!!!!!!!!!!!
还是老老实实的拼吧,好好工作然后傍个富婆,房子就有了哈哈哈哈哈哈