解析xml文件并去重追加到csv文件

       最近的工作老是跟 I/O 流打交道,最新的任务是:给定一个 XML 的 URL,读取其中的数据并存入 CSV 文件……哼,小仙女像怕事的人吗,看爸爸给你写出来……老规矩,贴代码……

public static void xmlWrite(String urlString, String filePath,
            Boolean judgeAppend) throws DocumentException {

        BufferedWriter writer = null;

        try {
            writer = new BufferedWriter(
                    new FileWriter(new File(filePath), true));
            SAXReader saxReader = new SAXReader();    //xml解析有sax解析和dom解析,我采用的是sax解析
            Document document = saxReader.read(new URL(urlString));
            Element root = document.getRootElement();
            List<Element> elist = root.elements("entry");
            System.out.println("elist.size:" + elist.size());


            //如果是第一次写入数据,则写上表头,如果是追加,就不写入(judgeAppend唯一的一丢丢用处)                                          if (judgeAppend == false) {
                String head = "id,title1,title2,title3,summary,updated,link,georss:line(georss:point)";
                writer.write(head);
                writer.newLine();
            }

            int line = 0;
            int point = 0;
            for (Element entry : elist) {
                StringBuffer sb = new StringBuffer();
                Boolean flag = false;
                if (entry.element("id") != null) {
                    String id = dealWithData(entry.element("id").getText()); //dealWithData(String)处理数据,数据中有逗号分隔符,就整体加上引号
                    List<String> ids = readCSVGetIds("D:/zcx/xmldata.csv"); //追加的情况下,去重,获取已存在数据的所有id
                    for (String idItem : ids) {
                        if (idItem.equals(id)) {
                            flag = true;
                            break;
                        }
                    }
                    if (flag == false) {
                        sb.append(id).append(",");
                    }()

                } else {
                    sb.append(" ").append(",");
                }

                if (flag == true) {
                    continue;
                }

                if (entry.element("title") != null) {
                    String title = dealWithData(entry.element("title")
                            .getText());
                    if (title.charAt(0) == '"'
                            && title.charAt(title.length() - 1) == '"') {
                        title = title.substring(1, title.length() - 2); // remove
                                                                        // ""
                    }
                    int firstIndex = title.indexOf('-', 0);
                    sb.append(dealWithData(title.substring(0, firstIndex - 1)))
                            .append(",");
                    int secondIndex = title.lastIndexOf('(');
                    sb.append(
                            dealWithData(title.substring(firstIndex + 2,
                                    secondIndex - 1))).append(",");
                    sb.append(
                            dealWithData(title.substring(secondIndex + 1,
                                    title.length() - 1))).append(",");
                } else {
                    sb.append(" ").append(",");
                }

                if (entry.element("summary") != null) {
                    String summary = dealWithData(entry.element("summary")
                            .getText());
                    summary = summary.substring(1, summary.length() - 1);
                    Pattern p1 = Pattern.compile("<strong>");
                    Matcher m1 = p1.matcher(summary);
                    String a1 = m1.replaceAll("; ");
                    Pattern p2 = Pattern.compile("</strong>");
                    Matcher m2 = p2.matcher(a1);
                    String afterSummary = m2.replaceAll("");
                    afterSummary = dealWithData(afterSummary.substring(1,
                            afterSummary.length()));
                    // System.out.println(afterSummary);
                    sb.append(afterSummary).append(",");
                } else {
                    sb.append(" ").append(",");
                }

                if (entry.element("updated") != null) {
                    String updated = dealWithData(entry.element("updated")
                            .getText());
                    sb.append(updated).append(",");
                } else {
                    sb.append(" ").append(",");
                }
                if (entry.element("link") != null) {
                    String link = dealWithData(entry.element("link").getText());
                    sb.append(link).append(",");
                } else {
                    sb.append(" ").append(",");
                }

                if (entry.element("line") != null) {  //这里需要注意的是xml解析中的标签,一个<entry></entry中>有<georss:line>或者<georss:point>,这样的数据获取就分为如下两种情况:
                    line++;          
                    String georssLine = entry.element("line").getText();
                    sb.append(georssLine).append(",");
                } else if (entry.element("point") != null) {
                    point++;
                    String georssPoint = entry.element("point").getText();
                    sb.append(georssPoint).append(",");
                } else {
                    sb.append(" ").append(",");
                }

                writer.write(sb.toString());
                writer.write(sb.toString());
                writer.newLine();
                writer.flush();
            }

            System.out.println("point:" + point);
            System.out.println("line:" + line);
            writer.close();   //这里必须爆粗口,特别气,上次这里忘记写,数据直接没写进去,这次写进去了,但是数据有丢失,缓冲果然还是不能忘记关啊
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

 路过的大佬,如有错误或者你认为不合理的地方,请给出意见……谢谢,手动笔芯……

猜你喜欢

转载自blog.csdn.net/zcx_hello/article/details/82684731