【WeChat Mini Program + JS】Splitting strings: breaking a paragraph into sentences

text

This is mainly based on an example from GitHub that splits paragraphs into sentences.
However, that example has a problem: after the split, the punctuation mark at the end of each sentence is not preserved.
So it needs to be improved.
Improvement: use a lookbehind assertion in the regular expression, so the text is split after the delimiter and the delimiter is kept.
Reference: [JS] split and keep the delimiter (not supported by the Firefox browser)

Further reading: lookahead and lookbehind assertions in regular expressions, which are worth studying in depth.

expand

the code

onLoad: function (options) {
    
    
    let a = this.sentenceSplit(
      "Twenty percent impulsive clown, twenty percent professional nonconformist, sixty percent brilliant physicist, Feynman strived to be a great performer almost as much as to be a great physicist? [email protected]"
    );
    console.log(a)
  },

  // Main function
  sentenceSplit(originSentence) {
    
    
    //占位中间字符串
    let BOOK_PLACEHOLDER = "《%-#-@#》";
    let QUOTE_PLACEHOLDER = "“%-#-@#”。";
    let QUOTE_PLACEHOLDER_2 = "“%-#-@#”";

    if (originSentence === "") {
    
    
      console.log("this sentence is null");
      return null;
    }

    var bookList = this.getBookQuoteList('book',originSentence);
    var quoteList = this.getBookQuoteList('quote',originSentence);

    //以下两个for循环用作处理引号与书名所包含的endSymbol规则的中间代码
    var endSymbol = /[。!!??…]+/;
    for (var i = 0; i < bookList.length; i++) {
    
    
      if (endSymbol.test(bookList[i])) {
    
    
        //   console.log(bookList[i] + "-----是个假句子");
        originSentence = originSentence.replace(bookList[i], BOOK_PLACEHOLDER);
      } else {
    
    
        //   console.log(bookList[i] + "------不是假句子");
      }
    }
    for (var i = 0; i < quoteList.length; i++) {
    
    
      if (endSymbol.test(quoteList[i])) {
    
    
        originSentence = originSentence.replace(quoteList[i], QUOTE_PLACEHOLDER);
      }
    }

    //开始分句
    console.log(originSentence)
    const reg = /(?<=\。|\。”|\!”|\\.”|\\.’|\?”|\! |\?|\: |\; |\?|\!|\. )/
    console.log(reg)
    let sentenceList = originSentence.split(reg);

    console.log(sentenceList);
    var tempList = this.restoreBookAndQuote(bookList, sentenceList, BOOK_PLACEHOLDER);
    // console.log(tempList);

    tempList = this.restoreBookAndQuote(quoteList, sentenceList, QUOTE_PLACEHOLDER_2);

    sentenceList = tempList;
    // console.log(tempList);
    return sentenceList
  },


  // Get the array of strings enclosed in book-title marks or quotation marks
  getBookQuoteList(type, sentence) {
    
    
    if (type == 'book') {
    
    
      var pat = new RegExp("《([^《|》]*)》", "g");
    } else {
    
    
      var pat = new RegExp('(".*?")|(“.*?”)', "g");
    }
    let results = [];
    do {
    
    
      var res = pat.exec(sentence);
      if (res) {
    
    
        results.push(res[0]);
      }
    } while (res);
    return results;
  },

  // Restore the content inside book-title marks and quotation marks
  restoreBookAndQuote(List, sentenceList, BookOrQuotePlaceHolder) {
    
    
    var endSymbol = /[。!!??…]+/;
    for (var i = 0; i < List.length; i++) {
    
    
      if (endSymbol.test(List[i])) {
    
    
        for (var j = 0; j < sentenceList.length; j++) {
    
    
          if (sentenceList[j].indexOf(BookOrQuotePlaceHolder) !== -1) {
    
    
            console.log(sentenceList[j]);
            let tempStr = sentenceList[j].replace(
              BookOrQuotePlaceHolder,
              List[i]
            );
            sentenceList.splice(j, 1, tempStr);
            break;
          }
        }
      }
    }
    return sentenceList;
  },

Guess you like

Origin blog.csdn.net/sinat_41838682/article/details/130342259