关于网页爬去编码问题,(编码格式设置无误,还是会出现乱码)

第一次发帖,可能个是不太注意 ,请大家谅解
最近遇到 2个蛋疼的 乱码问题,希望 万能的吧友 大神能帮忙解答

第一个 是: 爬取天气预报的时候 个别城市 的数据 乱码
url:
http://m.weather.com.cn/data/101110101.html

可以看到 头信息 编码神马的 都是UTF-8 ,返回值 也是UTF-8
上代码
public static String getResult(String urlStr,Proxy proxy) {
URL url = null;
HttpURLConnection connection = null;
try {
url = new URL(urlStr);
if(proxy == null){
connection = (HttpURLConnection) url.openConnection();
}else{
connection = (HttpURLConnection) url.openConnection(proxy);
}
 
connection.setDoOutput(true);
connection.setDoInput(true);
connection.setRequestMethod("GET");
connection.setUseCaches(false);
connection.setConnectTimeout(2000);
connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
connection.setRequestProperty("Accept-Encoding", "gzip,deflate");
connection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0");
connection.setRequestProperty("Cache-Control", "max-age=0");
connection.setRequestProperty("Connection", "keep-alive");
connection.setRequestProperty("Content-Type", "text/html; charset=utf-8");
 
connection.connect();
BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
StringBuffer buffer = new StringBuffer();
String line = "";
while ((line = reader.readLine()) != null) {
buffer.append(line);
}
reader.close();
return buffer.toString();
} catch (Exception e) {
log.error(e);
} finally {
if (connection != null) {
connection.disconnect();
}
}
return null;
}
//执行的  伪代码 
String json =  getResult("http://m.weather.com.cn/data/101110101.html",null); 

打印输出的 是 数据乱码 , (不是控制台的原因)
我爬取了大约2000+个城市的 天气,但是个别城市 有问题
我的解决方式 是用代理来访问,然后既可以解决乱码问题

第二个 是 google 地址解析 服务

ur:
http://maps.googleapis.com/maps/api/geocode/json?address=%E4%B8%8A%E6%B5%B7%E4%B8%8A%E6%B5%B7%E5%B8%82%E5%BE%90%E6%B1%87%E5%8C%BA%E7%BD%97%E7%A7%80%E8%B7%AF55%E5%BC%846%E5%8F%B7902%E5%AE%A4&sensor=true

在linux 下异常 但是在 windows 下正常
上代码

private String getGeocodingResultByAddr(String q) {
String site = "http://maps.googleapis.com/maps/api/geocode/json";
String params = "address=%s&sensor=false";
HttpURLConnection conn = null;
String json = "";
try {
params = String.format(params, URLEncoder.encode(q, charset));
URL url = new URL(site + "?" + params);
conn = (HttpURLConnection) url.openConnection();
conn.setConnectTimeout(5000);
conn.setDoInput(true);
conn.setRequestMethod("GET");
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
conn.setRequestProperty("Accept-Encoding","gzip");
conn.setRequestProperty("Accept-Language","zh-CN");
conn.setRequestProperty("Cache-Control","max-age=0");
conn.setRequestProperty("Cache-Control","keep-alive");
conn.setRequestProperty("Host","maps.googleapis.com");
conn.setRequestProperty("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
conn.setUseCaches(false);
conn.setDoOutput(true);
conn.setDoInput(true);
InputStream is = conn.getInputStream();
String contentType = conn.getContentType();
System.out.println(contentType);
if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
  ByteArrayOutputStream bytestream = new ByteArrayOutputStream(1024);
  int ch = 0;
  while ((ch = is.read()) != -1) {
  bytestream.write(ch);
  }
  byte imgdata[] = bytestream.toByteArray();
  System.out.println(new String(imgdata));
  System.out.println(new String(imgdata, "utf-8"));
  System.out.println(new String(imgdata, "gbk"));
  System.out.println(new String(imgdata, "gb18030"));
  bytestream.close();
}
is.close();
} catch (Exception e) {
json = null;
log.error("地址:" +q +"请求异常,结果"+json,e);
} finally {
if (conn != null)
conn.disconnect();
}
return json;
}
 
//执行的 伪代码 
getGeocodingResultByAddr("北京市海淀区上地");
 


我的解决方式 是用 httpclient
private String getGeocodingResultByAddr(String q) {
if(httpClient == null){
httpClient = new DefaultHttpClient();
httpClient.getParams().setParameter(HTTP.USER_AGENT,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36"); 
}
String site = "http://maps.googleapis.com/maps/api/geocode/json";
String params = "address=%s&sensor=false";
 
String json = "";
try {
params = String.format(params, URLEncoder.encode(q, charset));
String url = site + "?" + params;
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
httpGet.setHeader("Accept-Encoding","gzip");
httpGet.setHeader("Accept-Language","zh-CN");
HttpResponse response = httpClient.execute(httpGet);
HttpEntity entity = response.getEntity();
BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent() , "UTF-8"));
StringBuffer buffer = new StringBuffer();
String line = null;
while ((line = br.readLine()) != null) {
buffer.append(line);
}
json = buffer.toString();
} catch (Exception e) {
json = null;
log.error("地址:" +q +"请求异常,结果"+json,e);
}
return json;
}


求大神 们解惑

猜你喜欢

转载自hanjk1234.iteye.com/blog/1896525