返回> 网站首页
lucene分类统计
作者:yoours 发布时间:2025-04-18 18:46:31
简介:一边听听音乐,一边写写文章。
一、简介
网上公开的 Lucene 分类统计(分组查询)示例大多基于较老的版本,不适用于新版本 Lucene 9.4.1。下面给出一个可在 9.4.1 上运行的示例。
二、示例
/**
 * Demonstrates two-pass grouping ("category statistics") with the Lucene 9.4.1 grouping API.
 *
 * <p>The method rebuilds an on-disk index at {@code D:\luc} with three batches of sample
 * documents (catalogs {@code fruit}, {@code drink} and {@code person}), runs the query
 * {@code sweet} against the {@code text} field, groups the hits by the {@code catalog}
 * doc-values field, and prints the groups and their documents to standard output.
 *
 * <p>All Lucene resources (analyzer, directory, writer, reader) are closed on every exit
 * path, including the early return taken when no group matches.
 *
 * @throws Exception if indexing, query parsing or searching fails
 */
public static void Test() throws Exception
{
    // The index lives on disk (FSDirectory), not in memory.
    try (Analyzer analyzer = new StandardAnalyzer();
         Directory directory = FSDirectory.open(Paths.get("D:\\luc")))
    {
        // ---- Indexing -------------------------------------------------
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try (IndexWriter writer = new IndexWriter(directory, iwConfig))
        {
            // Start from an empty index so repeated runs do not accumulate documents.
            writer.deleteAll();
            writer.commit();
            addSampleDocs(writer, "Banana is sweet ", "fruit", 100);
            addSampleDocs(writer, "Juice is sweet ", "drink", 50);
            addSampleDocs(writer, "Hankcs is here ", "person", 25);
        }

        // ---- Searching ------------------------------------------------
        try (IndexReader reader = DirectoryReader.open(directory))
        {
            IndexSearcher searcher = new IndexSearcher(reader);
            String keyword = "sweet";
            // Build the Query with the classic QueryParser on the "text" field.
            QueryParser qp = new QueryParser("text", analyzer);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            int topNGroups = 10;   // number of groups per page
            int groupOffset = 0;   // index of the first group to return
            // groupSort orders the groups, docSort orders documents inside a group.
            // They are often the same but do not have to be. Use Sort.RELEVANCE to
            // order by score instead. The third SortField argument is "reverse":
            // false = ascending by date, true = descending.
            Sort docSort = new Sort(new SortField("date", SortField.Type.INT, false));
            Sort groupSort = docSort;
            int docOffset = 0;     // index of the first document inside each group
            int docsPerGroup = 10; // number of documents returned per group
            boolean requiredTotalGroupCount = true; // also count the total number of groups

            // First pass: find the top groups.
            FirstPassGroupingCollector<BytesRef> c1 =
                    new FirstPassGroupingCollector<>(new TermGroupSelector("catalog"), groupSort, topNGroups);
            boolean cacheScores = true;
            double maxCacheRAMMB = 16.0;
            // Cache the first-pass hits so the second pass can replay them
            // instead of running the query again.
            CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
            searcher.search(query, cachedCollector);
            Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset);
            if (topGroups == null)
            {
                // No group matched; try-with-resources still closes everything.
                return;
            }

            // Second pass: collect the documents of each top group.
            boolean getMaxScores = true;
            TopGroupsCollector<BytesRef> c2 = new TopGroupsCollector<>(
                    new TermGroupSelector("catalog"), topGroups, groupSort, docSort, docsPerGroup, getMaxScores);
            // Counting the total number of distinct groups is optional.
            AllGroupsCollector<BytesRef> allGroupsCollector = null;
            Collector secondPassCollector;
            if (requiredTotalGroupCount)
            {
                allGroupsCollector = new AllGroupsCollector<>(new TermGroupSelector("catalog"));
                secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
            }
            else
            {
                secondPassCollector = c2;
            }
            if (cachedCollector.isCached())
            {
                // The hits fit in the cache: replay them.
                cachedCollector.replay(secondPassCollector);
            }
            else
            {
                // The cache limit was exceeded: run the query a second time.
                searcher.search(query, secondPassCollector);
            }

            int totalGroupCount = -1; // stays -1 when the group count was not requested
            if (requiredTotalGroupCount)
            {
                totalGroupCount = allGroupsCollector.getGroupCount();
            }
            System.out.println("一共匹配到多少个分类: " + totalGroupCount);

            TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
            System.out.println("groupsResult.totalHitCount:" + groupsResult.totalHitCount);
            System.out.println("groupsResult.totalGroupedHitCount:" + groupsResult.totalGroupedHitCount);

            int groupIdx = 0;
            for (GroupDocs<BytesRef> groupDocs : groupsResult.groups)
            {
                groupIdx++;
                BytesRef groupValue = groupDocs.groupValue;
                // Guard against a null group value instead of failing with a NullPointerException.
                String groupLabel = groupValue == null ? "null" : groupValue.utf8ToString();
                System.out.println("group[" + groupIdx + "]:" + groupValue + " - " + groupLabel); // group key
                System.out.println("group[" + groupIdx + "]:" + groupDocs.totalHits); // hits in this group
                int docIdx = 0;
                for (ScoreDoc scoreDoc : groupDocs.scoreDocs)
                {
                    docIdx++;
                    System.out.println("group[" + groupIdx + "][" + docIdx + "]:" + scoreDoc.doc + "/" + scoreDoc.score);
                    Document doc = searcher.doc(scoreDoc.doc);
                    System.out.println("group[" + groupIdx + "][" + docIdx + "]:" + doc);
                    // Terminate the line so consecutive documents do not run together.
                    System.out.println(doc.getField("text").stringValue()
                            + " , " + doc.getField("date").stringValue());
                }
            }
        }
    }
}

/**
 * Adds {@code count} sample documents to the index.
 *
 * <p>Document {@code i} (0-based) gets the text {@code textPrefix + i}, a stored-only
 * {@code other} field, the given catalog as a {@link SortedDocValuesField}, and a
 * {@code date} value of {@code (2004 + i) * 10000 + 1111}. The date is written twice:
 * as doc values (used for sorting) and as a stored field (used for display).
 *
 * @param writer     open index writer to add the documents to
 * @param textPrefix prefix of the analyzed {@code text} field
 * @param catalog    value of the {@code catalog} grouping field
 * @param count      number of documents to add
 * @throws Exception if the writer fails to add a document
 */
private static void addSampleDocs(IndexWriter writer, String textPrefix, String catalog, int count) throws Exception
{
    for (int i = 0; i < count; ++i)
    {
        Document doc = new Document();
        doc.add(new TextField("text", textPrefix + i, Field.Store.YES));
        doc.add(new StoredField("other", "test100 " + i));
        doc.add(new SortedDocValuesField("catalog", new BytesRef(catalog)));
        int dateKey = (2004 + i) * 10000 + 1111;
        doc.add(new NumericDocValuesField("date", dateKey));
        doc.add(new StoredField("date", dateKey));
        writer.addDocument(doc);
    }
}