Having DeepLearning4j Read a Novel and Report the Most Related Words

DeepLearning4j is a neural network framework for Java that makes it easy for Java programmers to apply neural networks to machine learning projects.

Whatever the machine learning framework, NLP is a field it cannot skip, and DL4J provides NLP implementations as well. The classic introductory example is finding the most related words in a large body of text.

Let's look at the official demo first, then imitate it with a similar program, except that ours will read a Chinese novel.

The official demo is called Word2VecRawTextExample. Create a new Java Maven project with the following pom.xml:

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <groupId>com.tianyalei</groupId>
        <artifactId>wolf_ml_mnist</artifactId>
        <version>1.0-SNAPSHOT</version>

        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <nd4j.version>1.0.0-beta</nd4j.version>
            <dl4j.version>1.0.0-beta</dl4j.version>
            <datavec.version>1.0.0-beta</datavec.version>
            <logback.version>1.1.7</logback.version>
            <scala.binary.version>2.10</scala.binary.version>
        </properties>

        <dependencies>
            <!-- the neural network implementation -->
            <dependency>
                <groupId>org.deeplearning4j</groupId>
                <artifactId>deeplearning4j-core</artifactId>
                <version>${dl4j.version}</version>
            </dependency>
            <!-- CPU backend of ND4J, the library that powers DL4J;
                 switch to nd4j-cuda-9.1-platform for the GPU backend -->
            <dependency>
                <groupId>org.nd4j</groupId>
                <artifactId>nd4j-native-platform</artifactId>
                <!--<artifactId>nd4j-cuda-9.1-platform</artifactId>-->
                <version>${nd4j.version}</version>
            </dependency>
            <dependency>
                <groupId>org.deeplearning4j</groupId>
                <artifactId>deeplearning4j-nlp</artifactId>
                <version>${dl4j.version}</version>
            </dependency>
            <dependency>
                <groupId>org.deeplearning4j</groupId>
                <artifactId>deeplearning4j-ui_2.11</artifactId>
                <version>${dl4j.version}</version>
            </dependency>
            <dependency>
                <groupId>ch.qos.logback</groupId>
                <artifactId>logback-classic</artifactId>
                <version>${logback.version}</version>
            </dependency>
            <!-- Chinese word segmentation: start -->
            <!--<dependency>
                <groupId>org.fnlp</groupId>
                <artifactId>fnlp-core</artifactId>
                <version>2.1-SNAPSHOT</version>
            </dependency>-->
            <dependency>
                <groupId>net.sf.trove4j</groupId>
                <artifactId>trove4j</artifactId>
                <version>3.0.3</version>
            </dependency>
            <dependency>
                <groupId>commons-cli</groupId>
                <artifactId>commons-cli</artifactId>
                <version>1.2</version>
            </dependency>
            <!-- Chinese word segmentation: end -->
        </dependencies>
    </project>
The demo class itself:

    package com.tianyalei.nlp;

    import org.deeplearning4j.models.word2vec.Word2Vec;
    import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
    import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
    import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
    import org.nd4j.linalg.io.ClassPathResource;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import java.util.Collection;

    /**
     * Created by agibsonccc on 10/9/14.
     *
     * Neural net that processes text into wordvectors. See below url for an in-depth explanation.
     * https://deeplearning4j.org/word2vec.html
     */
    public class Word2VecRawTextExample {

        private static Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);

        public static void main(String[] args) throws Exception {

            // Gets Path to Text file
            String filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();

            log.info("Load & Vectorize Sentences....");
            // Strip white space before and after for each line
            SentenceIterator iter = new BasicLineIterator(filePath);
            // Split on white spaces in the line to get words
            TokenizerFactory t = new DefaultTokenizerFactory();

            /*
              CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
              So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
              Additionally it forces lower case for all tokens.
             */
            t.setTokenPreProcessor(new CommonPreprocessor());

            log.info("Building model....");
            Word2Vec vec = new Word2Vec.Builder()
                    // minimum number of times a word must appear in the corpus;
                    // words seen fewer than 5 times here are not learned
                    .minWordFrequency(5)
                    // number of coefficient updates per batch; too few and the network
                    // may not learn all it can, too many and training takes longer
                    .iterations(1)
                    // number of features per word vector, i.e. the dimensionality of the
                    // feature space: a word with 500 features is a point in a 500-dimensional space
                    .layerSize(100)
                    .seed(42)
                    .windowSize(5)
                    // the dataset the network is being trained on
                    .iterate(iter)
                    // feeds the current batch of words into the network
                    .tokenizerFactory(t)
                    .build();

            log.info("Fitting Word2Vec model....");
            vec.fit();

            log.info("Writing word vectors to text file....");

            // Prints out the closest 10 words to "day". An example on what to do with these Word Vectors.
            log.info("Closest Words:");
            Collection<String> lst = vec.wordsNearestSum("day", 10);
            //Collection<String> lst = vec.wordsNearest(Arrays.asList("king", "woman"), Arrays.asList("queen"), 10);
            log.info("10 Words closest to 'day': {}", lst);
        }
    }

This is the hello-world of NLP. The goal is to find the words most related to "day" in the supplied raw_sentences.txt; put that file under resources and run the program.
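Before looking at the output, a note on what "closest" means here: word vectors are compared by cosine similarity, which DL4J computes internally (for example in similarity() and wordsNearest()). A minimal illustrative sketch of that measure:

    // Illustrative only: cosine similarity between two word vectors,
    // the measure behind "closest" in the demo above.
    static double cosine(double[] a, double[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot   += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    // e.g. cosine(vec.getWordVector("day"), vec.getWordVector("week")) should come
    // out noticeably higher than for an unrelated pair of words.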

The output:

As you can see, the words most related to day include week, night, year, and so on, which is quite sensible. As for why: search the text for the word day and look at the words around it and how it is used, then do the same for week, night, and the others, and you will get a rough idea of what is happening.
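If you want to poke at the trained model beyond the top-10 list, the Word2Vec object exposes a few more calls. A small hypothetical helper (the class and method names are mine), to be called with the vec object after vec.fit():

    import org.deeplearning4j.models.word2vec.Word2Vec;

    // Hypothetical helper: call ModelInspector.inspect(vec) after vec.fit() above.
    public class ModelInspector {
        public static void inspect(Word2Vec vec) {
            // cosine similarity between two words' vectors, roughly in [-1, 1]
            System.out.println("similarity(day, week) = " + vec.similarity("day", "week"));

            // the raw learned vector; its length equals layerSize (100 here)
            System.out.println("dimensions: " + vec.getWordVector("day").length);

            // words seen fewer than minWordFrequency times never enter the vocabulary
            System.out.println("hasWord(day) = " + vec.hasWord("day"));
        }
    }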

The resources used in this article can be found in my project: https://github.com/tianyaleixiaowu/wolf_ml_mnist

The comments in the code cover the basic concepts. Next, let's have it learn a Chinese novel and report the most related words.

Unlike English, where spaces already separate the words, Chinese requires a dedicated word segmenter; otherwise the text is just unbroken sentences that cannot be split apart at all. So before the machine can learn from Chinese, we first have to split the Chinese sentences into individual words, as the short check below illustrates.
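A quick way to see the problem: DL4J's DefaultTokenizerFactory splits on whitespace, which does nothing useful for Chinese. A minimal check (the example sentence is made up for illustration):

    public class SplitCheck {
        public static void main(String[] args) {
            String en = "the day after tomorrow";
            String zh = "乔峰与段誉结为兄弟";
            System.out.println(en.split("\\s+").length); // 4: spaces delimit English words
            System.out.println(zh.split("\\s+").length); // 1: no spaces, so the whole sentence is one "word"
        }
    }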

There are many Chinese segmenters; in terms of ease of use and quality of results, Fudan's NLP toolkit is a reliable choice: https://github.com/FudanNLP/fnlp.

The GitHub page has documentation on usage. In short: download the three .m model files from models and fnlp-core.jar from libs, and add the jar to the project as a dependency lib (for instance by installing it into your local Maven repository with mvn install:install-file). The two extra jars FNLP needs are already in the pom.xml above.

Then fnlp can segment the document. The document we chose is tlbb.txt, the novel 天龙八部 (Demi-Gods and Semi-Devils), which before segmentation is just ordinary unsegmented prose.

The segmentation code:

    package com.tianyalei.nlp.tlbb;

    import java.io.*;
    import java.nio.charset.StandardCharsets;

    /**
     * Produces a word-segmented copy of the document.
     * @author wuweifeng wrote on 2018/6/29.
     */
    public class FenCi {
        private FudanTokenizer tokenizer = new FudanTokenizer();

        public void processFile() throws Exception {
            String filePath = this.getClass().getClassLoader().getResource("text/tlbb.txt").getPath();
            // read the novel explicitly as UTF-8; the platform default charset may garble Chinese text
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8));

            File outfile = new File("/Users/wuwf/project/tlbb_t.txt");
            if (outfile.exists()) {
                outfile.delete();
            }
            // FileOutputStream creates the file automatically if it does not exist
            FileOutputStream fop = new FileOutputStream(outfile);
            OutputStreamWriter writer = new OutputStreamWriter(fop, StandardCharsets.UTF_8);

            String line = in.readLine();
            while (line != null) {
                // one segmented sentence per output line, so the SentenceIterator
                // can later treat each line as a sentence
                writer.append(tokenizer.processSentence(line)).append('\n');
                line = in.readLine();
            }
            in.close();
            writer.close(); // closing the writer also flushes the buffered content to the file
            fop.close();    // close the output stream and release system resources
        }

        public static void main(String[] args) throws Exception {
            new FenCi().processFile();
        }
    }
FenCi relies on FudanTokenizer, a thin wrapper around FNLP's CWSTagger:

    package com.tianyalei.nlp.tlbb;

    import org.fnlp.ml.types.Dictionary;
    import org.fnlp.nlp.cn.tag.CWSTagger;
    import org.fnlp.nlp.corpus.StopWords;
    import org.fnlp.util.exception.LoadModelException;

    import java.io.IOException;
    import java.util.List;

    /**
     * @author wuweifeng wrote on 2018/6/29.
     */
    public class FudanTokenizer {
        private CWSTagger tag;

        // note: stopWords is only used by flitStopWords() and must be
        // initialized before that method is called
        private StopWords stopWords;

        public FudanTokenizer() {
            String path = this.getClass().getClassLoader().getResource("").getPath();
            System.out.println(path);
            try {
                tag = new CWSTagger(path + "models/seg.m");
            } catch (LoadModelException e) {
                e.printStackTrace();
            }
        }

        public String processSentence(String context) {
            return tag.tag(context);
        }

        public String processSentence(String sentence, boolean english) {
            if (english) {
                tag.setEnFilter(true);
            }
            return tag.tag(sentence);
        }

        public String processFile(String filename) {
            return tag.tagFile(filename);
        }

        /**
         * Set the segmentation dictionary.
         */
        public boolean setDictionary() {
            String dictPath = this.getClass().getClassLoader().getResource("models/dict.txt").getPath();

            Dictionary dict;
            try {
                dict = new Dictionary(dictPath);
            } catch (IOException e) {
                return false;
            }
            tag.setDictionary(dict);
            return true;
        }

        /**
         * Remove stop words.
         */
        public List<String> flitStopWords(String[] words) {
            try {
                return stopWords.phraseDel(words);
            } catch (Exception e) {
                e.printStackTrace();
                return null;
            }
        }
    }

Run it, and a moment later you get the segmented document tlbb_t.txt. Copy it into resources; this segmented document is what the model will actually learn from.
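Before spending time on training, it is worth a quick sanity check that the segmented file really ended up on the classpath and contains space-separated tokens. A small sketch (the class name is mine; the resource path matches what Tlbb below expects):

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;

    public class CheckSegmented {
        public static void main(String[] args) throws Exception {
            // read the segmented novel straight off the classpath as UTF-8
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    CheckSegmented.class.getClassLoader().getResourceAsStream("text/tlbb_t.txt"),
                    StandardCharsets.UTF_8))) {
                String first = in.readLine();
                System.out.println(first);
                System.out.println("tokens on first line: " + first.split("\\s+").length);
            }
        }
    }

With the segmented file confirmed, the training class follows; it mirrors the English demo almost line for line.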

    package com.tianyalei.nlp.tlbb;

    import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
    import org.deeplearning4j.models.word2vec.Word2Vec;
    import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
    import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
    import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
    import org.nd4j.linalg.io.ClassPathResource;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import java.io.File;
    import java.io.IOException;
    import java.util.Collection;

    /**
     * @author wuweifeng wrote on 2018/6/29.
     */
    public class Tlbb {
        private static Logger log = LoggerFactory.getLogger(Tlbb.class);

        public static void main(String[] args) throws IOException {
            String filePath = new ClassPathResource("text/tlbb_t.txt").getFile().getAbsolutePath();
            log.info("Load & Vectorize Sentences....");

            SentenceIterator iter = new BasicLineIterator(new File(filePath));

            TokenizerFactory t = new DefaultTokenizerFactory();
            t.setTokenPreProcessor(new CommonPreprocessor());

            log.info("Building model....");
            Word2Vec vec = new Word2Vec.Builder()
                    .minWordFrequency(5)
                    .iterations(1)
                    .layerSize(100)
                    .seed(42)
                    .windowSize(5)
                    .iterate(iter)
                    .tokenizerFactory(t)
                    .build();

            log.info("Fitting Word2Vec model....");
            vec.fit();

            // Write word vectors to file
            log.info("Writing word vectors to text file....");
            WordVectorSerializer.writeWordVectors(vec, "tlbb_vectors.txt");
            WordVectorSerializer.writeFullModel(vec, "tlbb_model.txt");

            String[] names = {"萧峰", "乔峰", "段誉", "虚竹", "王语嫣", "阿紫", "阿朱", "木婉清"};
            log.info("Closest Words:");

            for (String name : names) {
                System.out.println(name + ">>>>>>");
                Collection<String> lst = vec.wordsNearest(name, 10);
                System.out.println(lst);
            }
        }
    }

The code differs little from the earlier demo. Run it, and you can see the most related words for each of these characters.
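Because the vectors were persisted with WordVectorSerializer, later runs can skip the slow fit() step and reload them instead. A sketch (the class name is mine), assuming the tlbb_vectors.txt written by Tlbb above:

    import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
    import org.deeplearning4j.models.word2vec.Word2Vec;

    public class TlbbQuery {
        public static void main(String[] args) {
            // reload the previously trained vectors instead of re-fitting
            Word2Vec vec = WordVectorSerializer.readWord2VecModel("tlbb_vectors.txt");
            System.out.println(vec.wordsNearest("乔峰", 10));
        }
    }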

Reposted from www.cnblogs.com/wxydda/p/9267908.html