Java 网络爬虫:爬取网站源码
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Minimal web-spider demo: downloads a single page and prints its source line by line.
 */
public class WebSpiderTest {
    /**
     * Fetches {@code http://www.163.com} and echoes every line of the response to
     * standard output. Any I/O failure is reported as a stack trace on standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // try-with-resources closes the reader (and the stream it wraps) even when
        // reading fails part-way through; the original never closed it.
        // The charset is given explicitly instead of relying on the platform default;
        // the site is assumed to serve GBK -- confirm if the target URL changes.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new URL("http://www.163.com").openStream(),
                java.nio.charset.Charset.forName("gbk")))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one handler covers both.
            e.printStackTrace();
        }
    }
}
将上一个代码进行封装
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of the page a URL points to.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes no charset name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without their line terminators. If the URL is malformed
     * or reading fails, the stack trace is printed and whatever was read so far
     * (possibly the empty string) is returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; {@code null} or
     *                empty falls back to GBK
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or supported
     *                                  charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        // Honour the caller's charset; the parameter used to be ignored in favour of a
        // hard-coded "gbk".
        String charsetName = (charset == null || charset.isEmpty()) ? DEFAULT_CHARSET : charset;
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the stream is closed even if reading fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one handler covers both.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Prints the source of {@code http://www.163.com}, decoded as GBK.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
    }
}
获取网页的超链接,以及网页乱码处理
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of a page and lists the hyperlink targets found in it.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes no charset name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Matches {@code href="..."} attributes; group 1 is the attribute value. */
    private static final Pattern HREF = Pattern.compile("href=\"(.+?)\"");

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without their line terminators. If the URL is malformed
     * or reading fails, the stack trace is printed and whatever was read so far
     * (possibly the empty string) is returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; {@code null} or
     *                empty falls back to GBK
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or supported
     *                                  charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        // Honour the caller's charset; the parameter used to be ignored in favour of a
        // hard-coded "gbk".
        String charsetName = (charset == null || charset.isEmpty()) ? DEFAULT_CHARSET : charset;
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the stream is closed even if reading fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one handler covers both.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Prints the source of {@code http://www.163.com}, then for every {@code href}
     * attribute prints the whole match followed by just the link address.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
        // To capture whole anchor elements instead of just the address, match with
        // Pattern.compile("<a[\\s\\S]*?</a>").
        Matcher m = HREF.matcher(destStr);
        while (m.find()) {
            System.out.println(m.group());   // full attribute, e.g. href="..."
            System.out.println(m.group(1));  // link address only
        }
    }
}
将上一个代码进行优化
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of a page and extracts substrings from it by regular expression.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes no charset name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without their line terminators. If the URL is malformed
     * or reading fails, the stack trace is printed and whatever was read so far
     * (possibly the empty string) is returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; {@code null} or
     *                empty falls back to GBK
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or supported
     *                                  charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        // Honour the caller's charset; the parameter used to be ignored in favour of a
        // hard-coded "gbk".
        String charsetName = (charset == null || charset.isEmpty()) ? DEFAULT_CHARSET : charset;
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the stream is closed even if reading fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), decoder))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one handler covers both.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Collects one substring per match of {@code regexStr} in {@code destStr}.
     *
     * <p>When the pattern declares at least one capturing group, the text of group 1 is
     * collected (it may be {@code null} if that group did not take part in a match).
     * When the pattern has no capturing group, the whole match is collected instead of
     * failing with an {@code IndexOutOfBoundsException}.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return the collected substrings in match order; empty if nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regexStr} is not a valid
     *                                                regular expression
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        Matcher m = Pattern.compile(regexStr).matcher(destStr);
        // groupCount() depends only on the pattern, so decide once which group to read.
        int group = m.groupCount() >= 1 ? 1 : 0;
        List<String> result = new ArrayList<String>();
        while (m.find()) {
            result.add(m.group(group));
        }
        return result;
    }

    /**
     * Prints the source of {@code http://www.163.com} followed by the address of every
     * {@code href} attribute whose value consists only of word characters, whitespace,
     * {@code .}, {@code /} and {@code :}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
        // To capture whole anchor elements instead of just the address, pass the
        // pattern "<a[\\s\\S]*?</a>".
        List<String> result = getMatherSubstrs(destStr, "href=\"([\\w\\s./:]+?)\"");
        for (String t : result) {
            System.out.println(t);
        }
    }
}
喜欢我的可以关注我,我们可以一起交流学习
微信公众号:
让我爱上它Computer
qq群:473989408