词根统计系统 实现背单词计划

人生啊,总是在不断变化,Bug 往往会出现在意想不到的地方,对此我们需要万分小心。

在词根统计功能的基础上结合爬虫,从 https://www.etymonline.com/ 网站爬取单词的相关解释。查询接口为:

https://www.etymonline.com/search?q=

利用爬虫抓取页面并进行解析,主要代码如下:

/**
 * Generates a day-by-day vocabulary study plan from a word list.
 *
 * <p>For a base name {@code name} the class works with these files:
 * <ul>
 *   <li>{@code name.txt} - the space-separated word list (input);</li>
 *   <li>{@code name_fanyi.txt} - one "key translation" pair per line (input, optional);</li>
 *   <li>{@code name_jihua_jilu.txt} - index of the next plan day to generate (progress record);</li>
 *   <li>{@code name/jihua_N.txt} - the generated plan for day N (output).</li>
 * </ul>
 */
public class SkillOfWords {

    /** Number of words written into each daily plan file. */
    private static final int WORDS_PER_DAY = 10;

    /**
     * Translation cache (word -> translation). It is filled at most once per JVM,
     * from the translation file of whichever base name is used first.
     */
    private static Map<String,String> wordfanyicache = new HashMap<String, String>();

    /**
     * Loads {@code <name>_fanyi.txt} into the translation cache if the cache is still empty.
     *
     * <p>Each line is split on single spaces. The key is the first token with its final
     * character removed (presumably a trailing separator such as ':' - confirm against
     * the data file). Only lines with exactly two tokens contribute an entry.
     *
     * @param name base name of the word list
     * @throws IOException if the translation file cannot be read
     */
    private static void getwordfanyicache(String name) throws IOException {
        if (!wordfanyicache.isEmpty()) {
            return;
        }
        File file = new File(name + "_fanyi.txt");
        if (!file.exists()) {
            System.out.println("翻译文件不存在");
            return;
        }
        try (InputStream inputStream = new FileInputStream(file);
             BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) {
            String line;
            int cnt = 0;
            while ((line = bufferedReader.readLine()) != null) {
                String[] tmp = line.split(" ");
                // A line made only of spaces splits into an empty array; skip it instead of crashing.
                if (tmp.length == 0 || tmp[0].isEmpty()) {
                    continue;
                }
                String word = tmp[0].substring(0, tmp[0].length() - 1);
                if (cnt == 0) {
                    // Progress output: echoes keys until the first entry has been stored.
                    System.out.println(word);
                }
                String value = tmp.length == 2 ? tmp[1] : "";
                // Compare by content; '==' on strings only compares references.
                if (word.isEmpty() || value.isEmpty()) {
                    continue;
                }
                wordfanyicache.put(word, value);
                cnt++;
            }
        }
        System.out.println("终于读完了");
    }

    /**
     * Generates the plan files for every complete group of {@value #WORDS_PER_DAY} words
     * that has not been planned yet, and advances the progress record after each file.
     *
     * <p>Words left over at the end of the list (fewer than a full day) are not written.
     *
     * @param name base name of the word list; {@code name.txt} must exist
     * @throws IOException if any of the files cannot be read or written
     */
    public static void getSkill(String name) throws IOException {
        getwordfanyicache(name);

        File wordFile = new File(name + ".txt");
        if (!wordFile.exists()) {
            System.out.println("文件不存在");
            return;
        }

        String[] words = readWords(wordFile);
        int cntjihua = getJihuaTian(name);
        // The record holds the number of completed days, so exactly this many words are done.
        int alreadyPlanned = WORDS_PER_DAY * cntjihua;
        int totalwords = 0;
        int cntword = 0;
        StringBuilder jihua = new StringBuilder();

        for (String word : words) {
            if (word.isEmpty()) {
                continue;
            }
            totalwords++;
            // '<=' so that the last word of the previous day is not planned a second time.
            if (totalwords <= alreadyPlanned) {
                continue;
            }
            cntword++;
            jihua.append("\n").append(buildEntry(cntword, word));

            if (cntword == WORDS_PER_DAY) {
                writeFile(preparePlanFile(name, cntjihua), jihua.toString());
                System.out.println("第"+cntjihua+"天,生成完成");
                cntjihua++;
                saveJihuaTian(cntjihua, name);
                jihua.setLength(0);
                cntword = 0;
            }
        }
    }

    /**
     * Reads the whole word list and splits it on single spaces.
     * Lines are joined with a space so words on adjacent lines do not run together.
     */
    private static String[] readWords(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (InputStream inputStream = new FileInputStream(file);
             BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                text.append(line).append(' ');
            }
        }
        return text.toString().split(" ");
    }

    /**
     * Builds the plan entry for one word: its number within the day, the detail URL,
     * the cached translation (the text "null" when none is cached) and, when the page
     * could be fetched, the text extracted from it.
     */
    private static String buildEntry(int index, String word) throws IOException {
        String url = Link.WORD_DETAIL_BASE.getLink() + word;
        String ans = index + "、" + word + ":  " + url + "意思是: " + wordfanyicache.get(word) + "\n";
        HtmlPage page = Craw.getInstance().parsePage(new WebEntity(url));
        if (page != null) {
            // TODO: 2018/12/11 parse the page and store it; one file per 10 words forms a day's task
            ans = ans + LabelUtil.analyzeHTMLByString(page.asXml(), YeMian.WORD_DETAIL);
        }
        return ans;
    }

    /**
     * Returns the plan file for the given day inside the directory {@code name},
     * creating that directory when it does not exist yet.
     */
    private static File preparePlanFile(String name, int day) throws IOException {
        File dir = new File(name);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("cannot create plan directory: " + dir);
        }
        // File(parent, child) picks the platform separator instead of a hard-coded '\'.
        return new File(dir, "jihua_" + day + ".txt");
    }

    /** Writes {@code content} to {@code file}, replacing previous content; streams are always closed. */
    private static void writeFile(File file, String content) throws IOException {
        try (FileOutputStream fileOutputStream = new FileOutputStream(file);
             BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream)) {
            bufferedOutputStream.write(content.getBytes());
        }
    }

    /**
     * Stores the index of the next plan day in {@code <name>_jihua_jilu.txt}.
     *
     * @param jihua index of the next day to generate
     * @param name  base name of the word list
     * @throws IOException if the record file cannot be written
     */
    private static void saveJihuaTian(int jihua,String name) throws IOException {
        writeFile(new File(name + "_jihua_jilu.txt"), String.valueOf(jihua));
    }

    /**
     * Reads the index of the next plan day from {@code <name>_jihua_jilu.txt}.
     *
     * @param name base name of the word list
     * @return the stored index, or 0 when the record file is missing, empty or blank
     * @throws IOException if the record file cannot be read
     * @throws NumberFormatException if the first line is not an integer
     */
    private static int getJihuaTian(String name) throws IOException {
        File file = new File(name + "_jihua_jilu.txt");
        if (!file.exists()) {
            return 0;
        }
        try (InputStream inputStream = new FileInputStream(file);
             BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream))) {
            String line = bufferedReader.readLine();
            if (line == null || line.trim().isEmpty()) {
                return 0;
            }
            return Integer.parseInt(line.trim());
        }
    }
}

解析 HTML 的工具类:

/**
 * Helpers that reduce a fetched HTML page to plain text.
 */
public class LabelUtil {

    /** Decimal ({@code &#65;}) or hexadecimal ({@code &#x41;}) character reference. */
    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#(?:[xX]([0-9a-fA-F]+)|(\\d+));");

    /** Opening br/div/p/td-style tags; {@code [^>]} lets them span several lines. */
    private static final Pattern BLOCK_TAG =
            Pattern.compile("<\\s*(?:br|Br|BR|bR|div|DIV|Div|p|P|td|TD|Td)\\s*(?:[^>])*\\s*>");

    /** Any remaining tag that starts and ends on the same line. */
    private static final Pattern ANY_TAG = Pattern.compile("<.*?>");

    /** {@code &lt;} / {@code &gt;}; the semicolon is optional so none is left behind. */
    private static final Pattern ANGLE_ENTITY = Pattern.compile("&(?:g|l)t;?");

    private static final String SCRIPT_OPEN = "<script";
    private static final String SCRIPT_CLOSE = "</script>";

    /**
     * Extracts the text of the word-detail section from a page.
     *
     * @param html   the page markup
     * @param yeMian the page kind; only {@code YeMian.WORD_DETAIL} is handled
     * @return the plain text of the first {@code section} inside the first
     *         {@code .word--C9UPa} element, or an empty string when the page kind
     *         is not handled or the element is missing
     */
    public static String analyzeHTMLByString(String html,YeMian yeMian){
        if (yeMian != YeMian.WORD_DETAIL) {
            return "";
        }
        Document document = Jsoup.parse(html);
        // first() yields null when nothing matches, so check explicitly instead of catching an NPE.
        Element wordCard = document.select(".word--C9UPa").first();
        Element section = wordCard == null ? null : wordCard.select("section").first();
        if (section == null) {
            System.out.println("不存在");
            return "";
        }
        return handleHtmlLabel(section.toString());
    }

    /**
     * Converts an HTML fragment to plain text: decodes {@code &amp;} and numeric character
     * references, drops {@code <script>} elements, removes tags, {@code &nbsp;},
     * {@code &lt;} and {@code &gt;}, and trims the result.
     *
     * @param html the fragment; {@code null} is treated as empty
     * @return the trimmed plain text
     */
    public static String handleHtmlLabel(String html){
        if (html == null || html.isEmpty()) {
            return "";
        }
        String text = html.replace("&amp;", "&");
        text = decodeNumericEntities(text);
        text = stripScripts(text);
        text = BLOCK_TAG.matcher(text).replaceAll("");
        text = text.replace("&nbsp;", "");
        text = ANY_TAG.matcher(text).replaceAll("");
        text = ANGLE_ENTITY.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Replaces decimal and hexadecimal character references in a single pass.
     * References that do not denote a valid code point are left untouched.
     */
    private static String decodeNumericEntities(String text) {
        Matcher m = NUMERIC_ENTITY.matcher(text);
        StringBuffer out = new StringBuffer(text.length());
        while (m.find()) {
            String replacement = m.group();
            try {
                int codePoint = m.group(1) != null
                        ? Integer.parseInt(m.group(1), 16)
                        : Integer.parseInt(m.group(2));
                if (Character.isValidCodePoint(codePoint)) {
                    // toChars yields a surrogate pair for code points above U+FFFF.
                    replacement = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException ignored) {
                // Value does not fit in an int: not a real character, keep the text as it is.
            }
            // Quote so that a decoded '$' or '\' is not read as a group reference.
            m.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(out);
        return out.toString();
    }

    /**
     * Removes every {@code <script ...>...</script>} element. An opening tag without
     * a closing tag removes everything up to the end of the input.
     */
    private static String stripScripts(String html) {
        StringBuilder out = new StringBuilder(html.length());
        int pos = 0;
        while (true) {
            int open = html.indexOf(SCRIPT_OPEN, pos);
            if (open < 0) {
                out.append(html, pos, html.length());
                break;
            }
            out.append(html, pos, open);
            // Look for the closing tag after the opening one, never before it.
            int close = html.indexOf(SCRIPT_CLOSE, open);
            if (close < 0) {
                break;
            }
            pos = close + SCRIPT_CLOSE.length();
        }
        return out.toString();
    }
}

在这里插入图片描述

功能还没写完,待更新

猜你喜欢

转载自blog.csdn.net/xielinrui123/article/details/84957413