text
The main reference is an example on GitHub that splits paragraphs into sentences.
However, there is a problem with it. After the split, the punctuation marks at the end of the sentence are not preserved.
So it needs to be optimized.
Optimized approach: use a lookbehind assertion in the regular expression.
Reference: [JS] split and keep the delimiter (lookbehind is not supported by the Firefox browser)
Reference: lookahead and lookbehind assertions in regular expressions, for readers who want to study them in depth
expand
- Approaches to implementing word segmentation in a WeChat mini program (not tried in practice)
1. Question: how can a Chinese word-segmentation function such as Python's jieba be implemented in a mini program? - String segmentation techniques you may not know: using the Intl API (which cannot be used in mini programs)
the code
onLoad: function (options) {
let a = this.sentenceSplit(
"Twenty percent impulsive clown, twenty percent professional nonconformist, sixty percent brilliant physicist, Feynman strived to be a great performer almost as much as to be a great physicist? [email protected]"
);
console.log(a)
},
//主函数
sentenceSplit(originSentence) {
//占位中间字符串
let BOOK_PLACEHOLDER = "《%-#-@#》";
let QUOTE_PLACEHOLDER = "“%-#-@#”。";
let QUOTE_PLACEHOLDER_2 = "“%-#-@#”";
if (originSentence === "") {
console.log("this sentence is null");
return null;
}
var bookList = this.getBookQuoteList('book',originSentence);
var quoteList = this.getBookQuoteList('quote',originSentence);
//以下两个for循环用作处理引号与书名所包含的endSymbol规则的中间代码
var endSymbol = /[。!!??…]+/;
for (var i = 0; i < bookList.length; i++) {
if (endSymbol.test(bookList[i])) {
// console.log(bookList[i] + "-----是个假句子");
originSentence = originSentence.replace(bookList[i], BOOK_PLACEHOLDER);
} else {
// console.log(bookList[i] + "------不是假句子");
}
}
for (var i = 0; i < quoteList.length; i++) {
if (endSymbol.test(quoteList[i])) {
originSentence = originSentence.replace(quoteList[i], QUOTE_PLACEHOLDER);
}
}
//开始分句
console.log(originSentence)
const reg = /(?<=\。|\。”|\!”|\\.”|\\.’|\?”|\! |\?|\: |\; |\?|\!|\. )/
console.log(reg)
let sentenceList = originSentence.split(reg);
console.log(sentenceList);
var tempList = this.restoreBookAndQuote(bookList, sentenceList, BOOK_PLACEHOLDER);
// console.log(tempList);
tempList = this.restoreBookAndQuote(quoteList, sentenceList, QUOTE_PLACEHOLDER_2);
sentenceList = tempList;
// console.log(tempList);
return sentenceList
},
//获取书名号引号包含的字符串数组
getBookQuoteList(type, sentence) {
if (type == 'book') {
var pat = new RegExp("《([^《|》]*)》", "g");
} else {
var pat = new RegExp('(".*?")|(“.*?”)', "g");
}
let results = [];
do {
var res = pat.exec(sentence);
if (res) {
results.push(res[0]);
}
} while (res);
return results;
},
//还原书名号与引号中内容
restoreBookAndQuote(List, sentenceList, BookOrQuotePlaceHolder) {
var endSymbol = /[。!!??…]+/;
for (var i = 0; i < List.length; i++) {
if (endSymbol.test(List[i])) {
for (var j = 0; j < sentenceList.length; j++) {
if (sentenceList[j].indexOf(BookOrQuotePlaceHolder) !== -1) {
console.log(sentenceList[j]);
let tempStr = sentenceList[j].replace(
BookOrQuotePlaceHolder,
List[i]
);
sentenceList.splice(j, 1, tempStr);
break;
}
}
}
}
return sentenceList;
},