最近遇到 2个蛋疼的 乱码问题,希望 万能的吧友 大神能帮忙解答
第一个 是: 爬取天气预报的时候 个别城市 的数据 乱码
url:
http://m.weather.com.cn/data/101110101.html
可以看到 头信息 编码神马的 都是UTF-8 ,返回值 也是UTF-8
上代码
public static String getResult(String urlStr,Proxy proxy) { URL url = null; HttpURLConnection connection = null; try { url = new URL(urlStr); if(proxy == null){ connection = (HttpURLConnection) url.openConnection(); }else{ connection = (HttpURLConnection) url.openConnection(proxy); } connection.setDoOutput(true); connection.setDoInput(true); connection.setRequestMethod("GET"); connection.setUseCaches(false); connection.setConnectTimeout(2000); connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); connection.setRequestProperty("Accept-Encoding", "gzip,deflate"); connection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"); connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0"); connection.setRequestProperty("Cache-Control", "max-age=0"); connection.setRequestProperty("Connection", "keep-alive"); connection.setRequestProperty("Content-Type", "text/html; charset=utf-8"); connection.connect(); BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8")); StringBuffer buffer = new StringBuffer(); String line = ""; while ((line = reader.readLine()) != null) { buffer.append(line); } reader.close(); return buffer.toString(); } catch (Exception e) { log.error(e); } finally { if (connection != null) { connection.disconnect(); } } return null; } //执行的 伪代码 String json = getResult("http://m.weather.com.cn/data/101110101.html",null);
打印输出的 是 数据乱码 , (不是控制台的原因)
我爬取了大约2000+个城市的 天气,但是个别城市 有问题
我的解决方式 是用代理来访问,然后既可以解决乱码问题
第二个 是 google 地址解析 服务
ur:
http://maps.googleapis.com/maps/api/geocode/json?address=%E4%B8%8A%E6%B5%B7%E4%B8%8A%E6%B5%B7%E5%B8%82%E5%BE%90%E6%B1%87%E5%8C%BA%E7%BD%97%E7%A7%80%E8%B7%AF55%E5%BC%846%E5%8F%B7902%E5%AE%A4&sensor=true
在linux 下异常 但是在 windows 下正常
上代码
private String getGeocodingResultByAddr(String q) { String site = "http://maps.googleapis.com/maps/api/geocode/json"; String params = "address=%s&sensor=false"; HttpURLConnection conn = null; String json = ""; try { params = String.format(params, URLEncoder.encode(q, charset)); URL url = new URL(site + "?" + params); conn = (HttpURLConnection) url.openConnection(); conn.setConnectTimeout(5000); conn.setDoInput(true); conn.setRequestMethod("GET"); conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); conn.setRequestProperty("Accept-Encoding","gzip"); conn.setRequestProperty("Accept-Language","zh-CN"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("Cache-Control","keep-alive"); conn.setRequestProperty("Host","maps.googleapis.com"); conn.setRequestProperty("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"); conn.setUseCaches(false); conn.setDoOutput(true); conn.setDoInput(true); InputStream is = conn.getInputStream(); String contentType = conn.getContentType(); System.out.println(contentType); if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) { ByteArrayOutputStream bytestream = new ByteArrayOutputStream(1024); int ch = 0; while ((ch = is.read()) != -1) { bytestream.write(ch); } byte imgdata[] = bytestream.toByteArray(); System.out.println(new String(imgdata)); System.out.println(new String(imgdata, "utf-8")); System.out.println(new String(imgdata, "gbk")); System.out.println(new String(imgdata, "gb18030")); bytestream.close(); } is.close(); } catch (Exception e) { json = null; log.error("地址:" +q +"请求异常,结果"+json,e); } finally { if (conn != null) conn.disconnect(); } return json; } //执行的 伪代码 getGeocodingResultByAddr("北京市海淀区上地");
我的解决方式 是用 httpclient
private String getGeocodingResultByAddr(String q) { if(httpClient == null){ httpClient = new DefaultHttpClient(); httpClient.getParams().setParameter(HTTP.USER_AGENT,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36"); } String site = "http://maps.googleapis.com/maps/api/geocode/json"; String params = "address=%s&sensor=false"; String json = ""; try { params = String.format(params, URLEncoder.encode(q, charset)); String url = site + "?" + params; HttpGet httpGet = new HttpGet(url); httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpGet.setHeader("Accept-Encoding","gzip"); httpGet.setHeader("Accept-Language","zh-CN"); HttpResponse response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent() , "UTF-8")); StringBuffer buffer = new StringBuffer(); String line = null; while ((line = br.readLine()) != null) { buffer.append(line); } json = buffer.toString(); } catch (Exception e) { json = null; log.error("地址:" +q +"请求异常,结果"+json,e); } return json; }
求大神 们解惑