Android进阶 十三 网络爬虫 json应用

               

    刚开始接触网络爬虫,怎一个“菜”字了得!经过几次的折磨,对其原理以及其中用到的json技术有了大致的了解,故作一总结,供有同样迷惑的朋友参考学习。

    自己爬取的网站内容为12306的余票查询模块。利用火狐浏览器为Web开发者内置的Web控制台,可得到爬取网页的请求网址,如下图所示:


其中的请求网址即为我们需要爬取的网址。另外,可得知其请求协议采用的是Https协议,采用GET方式访问。爬取源代码如下所示:
        public static String queryDate = "2015-04-19"public static String from_station = "JNK"public static String to_station = "BJP"public static void main(String[] args) throws Exception {  HostnameVerifier hv = new HostnameVerifier() {   public boolean verify(String urlHostName, SSLSession session) {    System.out.println("Warning: URL Host: " + urlHostName      + " vs. " + session.getPeerHost());    return true;   }  };        String url = "https://kyfw.12306.cn/otn/lcxxcx/query?purpose_codes=ADULT&queryDate="    + queryDate    + "&from_station="    + from_station    + "&to_station="    + to_station;  ProtocolUrlValidator.trustAllHttpsCertificates();  HttpsURLConnection.setDefaultHostnameVerifier(hv);  String result = WebServiceUtil.invokeByHTTPGET(url, null);    Gson gson = new Gson();  Trains trains = gson.fromJson(result, Trains.class);    List<Item> items = trains.getData().getItems();    if (trains.getHttpstatus() != 200) {   trains.getMessages();  } else {   if (items != null && items.size() != 0)    for (Item item : items) {     System.out.println(item);    }  } }}
由于使用的协议为HTTPS,故访问之前需要先进行证书的校验。其中蓝色代码块为我们需要访问的网址,涉及到的invokeByHTTPGET(url,null)代码如下所示:
/**
 * Static helpers for calling remote services over HTTP: SOAP 1.1, SOAP 1.2,
 * form-encoded POST and plain GET.
 *
 * <p>Every method returns the response body decoded as UTF-8 when the server
 * answers with status 200, and {@code null} for any other status.
 *
 * <p>NOTE: parameter names and values are written into the request exactly as
 * given. Callers are responsible for URL-encoding (GET/POST) or XML-escaping
 * (SOAP) them beforehand.
 */
public class WebServiceUtil {

    private static final String CHARSET = "UTF-8";
    private static final String WSDL_SUFFIX = "?wsdl";
    private static final String SOAP11_NS = "http://schemas.xmlsoap.org/soap/envelope/";
    private static final String SOAP12_NS = "http://www.w3.org/2003/05/soap-envelope";

    /**
     * Calls a web service using a SOAP 1.1 envelope.
     *
     * @param wsdl               service URL; a trailing {@code ?wsdl} is removed
     * @param method             name of the operation element
     * @param namespace          namespace bound to the {@code ns0} prefix
     * @param headerParameters   SOAP header entries, or {@code null} for no header
     * @param bodyParameters     operation arguments, or {@code null} for none
     * @param isBodyParametersNS whether argument elements carry the {@code ns0} prefix
     * @return the response body, or {@code null} if the status is not 200
     * @throws Exception if the URL is invalid or the request fails
     */
    public static String invokeBySoap11(String wsdl, String method,
            String namespace, Map<String, String> headerParameters,
            Map<String, String> bodyParameters, boolean isBodyParametersNS)
            throws Exception {
        String envelope = buildEnvelope("soap", SOAP11_NS, method, namespace,
                headerParameters, bodyParameters, isBodyParametersNS);
        return post(stripWsdlSuffix(wsdl), "text/xml;charset=utf-8", envelope);
    }

    /**
     * Calls a web service using a SOAP 1.2 envelope.
     *
     * @param wsdl               service URL; a trailing {@code ?wsdl} is removed
     * @param method             name of the operation element
     * @param namespace          namespace bound to the {@code ns0} prefix
     * @param headerParameters   SOAP header entries, or {@code null} for no header
     * @param bodyParameters     operation arguments, or {@code null} for none
     * @param isBodyParametersNS whether argument elements carry the {@code ns0} prefix
     * @return the response body, or {@code null} if the status is not 200
     * @throws Exception if the URL is invalid or the request fails
     */
    public static String invokeBySoap12(String wsdl, String method,
            String namespace, Map<String, String> headerParameters,
            Map<String, String> bodyParameters, boolean isBodyParametersNS)
            throws Exception {
        String envelope = buildEnvelope("soap12", SOAP12_NS, method, namespace,
                headerParameters, bodyParameters, isBodyParametersNS);
        System.out.println(envelope);
        // SOAP 1.2 messages use the application/soap+xml media type.
        return post(stripWsdlSuffix(wsdl), "application/soap+xml;charset=utf-8", envelope);
    }

    /**
     * Calls a service with an {@code application/x-www-form-urlencoded} POST.
     *
     * @param urlPath         target URL
     * @param inputParameters form fields; {@code null} or empty sends an empty body
     * @return the response body, or {@code null} if the status is not 200
     * @throws Exception if the URL is invalid or the request fails
     */
    public static String invokeByHTTPPOST(String urlPath, Map<String, String> inputParameters)
            throws Exception {
        String form = joinParameters(inputParameters);
        System.out.println(form);
        return post(urlPath, "application/x-www-form-urlencoded", form);
    }

    /**
     * Calls a service with an HTTP GET.
     *
     * @param urlPath         target URL, which may already contain a query string
     * @param inputParameters extra query parameters appended to {@code urlPath};
     *                        {@code null} or empty leaves the URL unchanged
     * @return the response body, or {@code null} if the status is not 200
     * @throws Exception if the URL is invalid or the request fails
     */
    public static String invokeByHTTPGET(String urlPath,
            Map<String, String> inputParameters)
            throws Exception {
        String fullUrl = appendQuery(urlPath, joinParameters(inputParameters));
        System.out.println(fullUrl);
        HttpURLConnection conn = (HttpURLConnection) new URL(fullUrl).openConnection();
        try {
            // Deliberately no setDoOutput(true): on Android that switches the
            // request method to POST.
            conn.setRequestMethod("GET");
            return readBodyIfOk(conn);
        } finally {
            conn.disconnect();
        }
    }

    /** Removes a trailing "?wsdl" (any letter case) and leaves other URLs untouched. */
    private static String stripWsdlSuffix(String wsdl) {
        int cut = wsdl.length() - WSDL_SUFFIX.length();
        if (cut >= 0 && wsdl.regionMatches(true, cut, WSDL_SUFFIX, 0, WSDL_SUFFIX.length())) {
            return wsdl.substring(0, cut);
        }
        return wsdl;
    }

    /** Builds a SOAP envelope whose envelope prefix is bound to {@code envelopeNs}. */
    private static String buildEnvelope(String prefix, String envelopeNs, String method,
            String namespace, Map<String, String> headerParameters,
            Map<String, String> bodyParameters, boolean isBodyParametersNS) {
        StringBuilder sb = new StringBuilder();
        sb.append('<').append(prefix).append(":Envelope");
        sb.append(" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"");
        sb.append(" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\"");
        sb.append(" xmlns:").append(prefix).append("=\"").append(envelopeNs).append('"');
        sb.append(" xmlns:ns0=\"").append(namespace).append("\">");

        if (headerParameters != null) {
            sb.append('<').append(prefix).append(":Header>");
            for (Entry<String, String> header : headerParameters.entrySet()) {
                appendElement(sb, "ns0:" + header.getKey(), header.getValue());
            }
            sb.append("</").append(prefix).append(":Header>");
        }

        sb.append('<').append(prefix).append(":Body><ns0:").append(method).append('>');
        if (bodyParameters != null) {
            for (Entry<String, String> argument : bodyParameters.entrySet()) {
                String tag = isBodyParametersNS
                        ? "ns0:" + argument.getKey()
                        : argument.getKey();
                appendElement(sb, tag, argument.getValue());
            }
        }
        sb.append("</ns0:").append(method).append('>');
        sb.append("</").append(prefix).append(":Body>");
        sb.append("</").append(prefix).append(":Envelope>");
        return sb.toString();
    }

    /** Appends {@code <tag>value</tag>} without escaping. */
    private static void appendElement(StringBuilder sb, String tag, String value) {
        sb.append('<').append(tag).append('>');
        sb.append(value);
        sb.append("</").append(tag).append('>');
    }

    /** Joins the map as {@code k1=v1&k2=v2}; returns "" for a null or empty map. */
    private static String joinParameters(Map<String, String> parameters) {
        StringBuilder sb = new StringBuilder();
        if (parameters != null) {
            for (Entry<String, String> parameter : parameters.entrySet()) {
                if (sb.length() > 0) {
                    sb.append('&');
                }
                sb.append(parameter.getKey());
                sb.append('=');
                sb.append(parameter.getValue());
            }
        }
        return sb.toString();
    }

    /** Adds {@code query} to {@code urlPath} using '?' or '&' as appropriate. */
    private static String appendQuery(String urlPath, String query) {
        if (query.length() == 0) {
            return urlPath;
        }
        if (urlPath.endsWith("?") || urlPath.endsWith("&")) {
            return urlPath + query;
        }
        char separator = urlPath.indexOf('?') < 0 ? '?' : '&';
        return urlPath + separator + query;
    }

    /** Sends {@code payload} as a UTF-8 POST body and returns the response body. */
    private static String post(String target, String contentType, String payload)
            throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(target).openConnection();
        try {
            conn.setRequestMethod("POST");
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestProperty("Content-Type", contentType);
            OutputStream out = conn.getOutputStream();
            try {
                // Encode explicitly; the platform default charset may not be UTF-8.
                out.write(payload.getBytes(CHARSET));
            } finally {
                out.close();
            }
            return readBodyIfOk(conn);
        } finally {
            conn.disconnect();
        }
    }

    /** Reads the whole response as UTF-8, or returns null when the status is not 200. */
    private static String readBodyIfOk(HttpURLConnection conn) throws Exception {
        if (conn.getResponseCode() != 200) {
            return null;
        }
        InputStream in = conn.getInputStream();
        try {
            // Collect raw bytes first and decode once: decoding chunk by chunk
            // breaks multi-byte characters that straddle a buffer boundary.
            java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            int count;
            while ((count = in.read(chunk)) != -1) {
                collected.write(chunk, 0, count);
            }
            return collected.toString(CHARSET);
        } finally {
            in.close();
        }
    }
}
 
   
  

以上代码块涉及到的发送请求方式有通过SOAP1.1协议调用Web服务、通过SOAP1.2协议调用Web服务、

通过HTTP POST传参方式调用服务和通过HTTP GET传参方式调用服务。其具体的请求方式在源代码中已以注释方式详细给出,故此处不再赘述。

在爬取过程中,我们还需要用到json在线校验工具,网址为:点击打开链接。主要利用此工具完成的操作为:验证json格式的正确性,根据json串生成相应的POJO类。如下图所示:

                                                                                                                                           json格式校验

                                                                                                                                                                                                      生成POJO类
至此,网络爬虫的过程基本结束。此次实验的返回结果如下图所示:

注:自己对Gson解析json还存在一定的误区。对于json串中的key,其实是与相应类中的变量名一一对应的,否则,在解析时得到的将会是null!例如,

以上的json串中存在key为datas,则在创建POJO时,不可随意更改变量名。若将 private List<Item> datas; 改写为 private List<Item> items; 则会使 List<Item> items = trains.getData().getItems(); 返回null。若自己需要更改变量名的话,可以采用注解的方式解决,例如 @SerializedName("datas") private List<Item> items;

           

再分享一下我老师大神的人工智能教程吧。零基础!通俗易懂!风趣幽默!还带黄段子!希望你也加入到我们人工智能的队伍中来!https://blog.csdn.net/jiangjunshow

猜你喜欢

转载自blog.csdn.net/qq_43667184/article/details/87196915
今日推荐