public class SimpleSortedSetFacetsExample { /*这里例子原来是讲index文件放在内存中,本人为了查看文件格式将其放在指定目录里 面*/ private Directory indexDir = null;//new RAMDirectory(); private final FacetsConfig config = new FacetsConfig(); /** Empty constructor */ public SimpleSortedSetFacetsExample() { try { indexDir = FSDirectory.open(Paths.get("D:\\index")); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** Build the example index. */ private void index() throws IOException { IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig( new WhitespaceAnalyzer()).setOpenMode(OpenMode.CREATE)); Document doc = new Document(); // 这里保存文件格式用的是专门SortedSetDocValuesFacetField field,4.0+支持 doc.add(new SortedSetDocValuesFacetField("Author", "Bob")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2010")); //在build方法里面开始组织field信息,见code1: indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Lisa")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2010")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Lisa")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2012")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Susan")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "2012")); indexWriter.addDocument(config.build(doc)); doc = new Document(); doc.add(new SortedSetDocValuesFacetField("Author", "Frank")); doc.add(new SortedSetDocValuesFacetField("Publish Year", "1999")); indexWriter.addDocument(config.build(doc)); indexWriter.close(); } /** User runs a query and counts facets. */ private List<FacetResult> search() throws IOException { DirectoryReader indexReader = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(indexReader); SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader); // Aggregatses the facet counts FacetsCollector fc = new FacetsCollector(); // MatchAllDocsQuery is for "browsing" (counts facets // for all non-deleted docs in the index); normally // you'd use a "normal" query: FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc); // Retrieve results Facets facets = new SortedSetDocValuesFacetCounts(state, fc); List<FacetResult> results = new ArrayList<>(); results.add(facets.getTopChildren(10, "Author")); results.add(facets.getTopChildren(10, "Publish Year")); //在这里触发flush 操作 indexReader.close(); return results; } /** User drills down on 'Publish Year/2010'. */ private FacetResult drillDown() throws IOException { DirectoryReader indexReader = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(indexReader); SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader); // Now user drills down on Publish Year/2010: DrillDownQuery q = new DrillDownQuery(config); q.add("Publish Year", "2010"); FacetsCollector fc = new FacetsCollector(); FacetsCollector.search(searcher, q, 10, fc); // Retrieve results Facets facets = new SortedSetDocValuesFacetCounts(state, fc); FacetResult result = facets.getTopChildren(10, "Author"); indexReader.close(); return result; } /** Runs the search example. */ public List<FacetResult> runSearch() throws IOException { index(); return search(); } /** Runs the drill-down example. */ public FacetResult runDrillDown() throws IOException { index(); return drillDown(); } /** Runs the search and drill-down examples and prints the results. */ public static void main(String[] args) throws Exception { System.out.println("Facet counting example:"); System.out.println("-----------------------"); SimpleSortedSetFacetsExample example = new SimpleSortedSetFacetsExample(); List<FacetResult> results = example.runSearch(); System.out.println("Author: " + results.get(0)); System.out.println("Publish Year: " + results.get(0)); System.out.println("\n"); System.out.println("Facet drill-down example (Publish Year/2010):"); System.out.println("---------------------------------------------"); System.out.println("Author: " + example.runDrillDown()); } }
code1:
//FacetsConfig.build public Document build(TaxonomyWriter taxoWriter, Document doc) throws IOException { ...... //组织SortedSetDocValuesFacetField 信息 if (field.fieldType() == SortedSetDocValuesFacetField.TYPE) { SortedSetDocValuesFacetField facetField = (SortedSetDocValuesFacetField) field; FacetsConfig.DimConfig dimConfig = getDimConfig(facetField.dim); if (dimConfig.multiValued == false) { checkSeen(seenDims, facetField.dim); } /**这里做了统一处理,对于所有的 SortedSetDocValuesFacetField ,其 indexFieldName 统一为$facets,与用户在SortedSetDocValuesFacetField 中定义的完全无关,用户定义的名值对在SortedSetDocValuesFacetField 映射应该是dim,label, 如new SortedSetDocValuesFacetField("Author", "Bob") 那么其dim应该是Author,label是Bob*/ String indexFieldName = dimConfig.indexFieldName; List<SortedSetDocValuesFacetField> fields = dvByField.get(indexFieldName); /** fields 这里存储的是 所有的SortedSetDocValuesFacetField , 并且以统一的以indexFieldName 保存在dvByField */ if (fields == null) { fields = new ArrayList<>(); dvByField.put(indexFieldName, fields); } //完成文档的fields的初步处理 fields.add(facetField); } ....... processFacetFields(taxoWriter, byField, result); //这里专门对SortedSetDocValuesFacetField 名值对二次处理,具体见下: processSSDVFacetFields(dvByField, result); processAssocFacetFields(taxoWriter, assocByField, result); ............ } //FacetsConfig.processSSDVFacetFields private void private void processSSDVFacetFields(Map<String,List<SortedSetDocValuesFacetField>> byField, Document doc) throws IOException { //System.out.println("process SSDV: " + byField); for(Map.Entry<String,List<SortedSetDocValuesFacetField>> ent : byField.entrySet()) { String indexFieldName = ent.getKey(); //System.out.println(" field=" + indexFieldName); // 循环文档中所有的SortedSetDocValuesFacetField for(SortedSetDocValuesFacetField facetField : ent.getValue()) { FacetLabel cp = new FacetLabel(facetField.dim, facetField.label); /*这里是将fields 中 dim 和 label 连起来中间用"\u001f" 做单元分隔符做区别 , 比如Author\u001fBob*/ String fullPath = pathToString(cp.components, cp.length); //System.out.println("add " + fullPath); // For facet counts: //将整个fullPath 做为值存储。 doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath))); // For drill-down: doc.add(new StringField(indexFieldName, fullPath, Field.Store.NO)); doc.add(new StringField(indexFieldName, facetField.dim, Field.Store.NO)); } } }
以上完成了SortedSetDocValuesField的一个重组,在分析写进文件系统中,有几个问题先问自己?
上面的代码分析可以看出最后要存储进去的一定是一个dim+label完整的字符串。
1.DocValues 如果是被认为是每个文档的相应的field值,那么存储的时候是按照文档存储的?比如文档1:这个$facets值是E....文档2,这个$facets值 C 存储?那么如果文档间如果有相同的filed值是按照上面重复存储还是只存一个值,如文档1:这个$facets值是E,文档2如果也是相同值E,这个$facets值是存地址?因为A已经存储在文档1中,是否有类似地址指向指向E值?(瞎猜?!),如果按照这样设计文件如何能改善facet统计,这和以前存储又何区别?
2.如果存储不是按照1上面所说的,还有一种方案就是所有的文档在同一个field值给所有文档排序去重共享(SortedSet),那么上面例子存储值就是C,E 对于文档1,2共享,但是这样又带来一个新问题,我必须知道文档1有过什么值,文档2 有过什么值。这又回到1按照文档存储值的老思路上来?!