狠狠撸

lucene2.4 gong on

我把 lucene2.4 发行包的一个例子改的更简单些，仅供参考，其中在 eclipse
中运行遇到中文乱码问题，这些在代码中会有体现。

完成这个例子后，我顺便试了一下 qieqie 的庖丁解牛中文分词器，很好用，
而且分发包中有个简单的说明文档。

Java 代码

1. package tutorial;
2.
3. import java.io.File;
4. import java.io.FileInputStream;
5. import java.io.FileNotFoundException;
6. import java.io.IOException;
7. import java.io.InputStreamReader;
8.
9. import org.apache.lucene.analysis.standard.StandardAnalyzer;
10.import org.apache.lucene.document.Document;
11.import org.apache.lucene.document.Field;
12.import org.apache.lucene.index.IndexWriter;
13.
14.public class IndexFiles {
15.
16. public static void main(String[] args) {
17. long start = System.currentTimeMillis();
18. try {
19. IndexWriter writer = new IndexWriter("index",
20. new StandardAnalyzer(), true,
21. IndexWriter.MaxFieldLength.LIMITED);
22. indexDocs(writer, new File("data"));
23. writer.optimize();
24. writer.close();
25. System.out.println("用时：" + (System.currentTimeMi
llis() - start)
26. + " 毫秒");
27. } catch (IOException e) {
28. e.printStackTrace();

29. }
30. }
31.
32. static void indexDocs(IndexWriter writer, File file) throws
IOException {
33. if (file.canRead()) {
34. if (file.isDirectory()) {
35. String[] files = file.list();
36. if (files != null) {
37. for (int i = 0; i < files.length; i++) {
38. indexDocs(writer, new File(file, files[
i]));
39. }
40. }
41. } else {
42. System.out.println("添加 " + file);
43. try {
44. //针对参数文件建立索引文档
45. Document doc = new Document();
46. //Field.Index.NOT_ANALYZED 文件名称建立索
引，但不分词
47. doc.add(new Field("filename", file.getCanon
icalPath(),
48. Field.Store.YES, Field.Index.NOT_AN
ALYZED));
49. doc.add(new Field("contents",
50. new InputStreamReader(new FileInput
Stream(file.getCanonicalPath()), "utf-8")));
51. //在 writer 中加入此文档
52. writer.addDocument(doc);
53. //抛弃了原来 demo 中的如下做法，因为这样不能
设定字符编码，当出现因为编码为题查不到
54. //中文字符时，便束手无策。
55. //writer.addDocument(FileDocument.Document(
file));
56. } catch (FileNotFoundException fnfe) {
57. ;
58. }
59. }
60. }
61. }
62.
63.}

Java 代码

2.
3. import java.io.BufferedReader;
4. import java.io.IOException;
5. import java.io.InputStreamReader;
6.
7. import org.apache.lucene.analysis.Analyzer;
8. import org.apache.lucene.analysis.standard.StandardAnalyzer;
9. import org.apache.lucene.document.Document;
10.import org.apache.lucene.index.IndexReader;
11.import org.apache.lucene.queryParser.QueryParser;
12.import org.apache.lucene.search.IndexSearcher;
13.import org.apache.lucene.search.Query;
14.import org.apache.lucene.search.ScoreDoc;
15.import org.apache.lucene.search.Searcher;
16.import org.apache.lucene.search.TopDocCollector;
17.
18.public class SearchFiles {
19.
20. public static void main(String[] args) throws Exception {

21.
22. IndexReader reader = IndexReader.open("index");
23.
24. Searcher searcher = new IndexSearcher(reader);
25. Analyzer analyzer = new StandardAnalyzer();
26.
27. BufferedReader in = new BufferedReader(new InputStreamR
eader(System.in));
28.
29. String field = "contents";
30. QueryParser parser = new QueryParser(field, analyzer);

31.
32. String queries = null;
33. Query query = null;
34. while (true) {
35. if (queries == null) {
36. System.out.print("输入查询词(quit or exit 退出):
");

37. }
38.
39. String line = in.readLine();
40. if (line == null || line.length() == -1) {
41. continue;
42. } else if (line.equals("quit") || line.equals("exit
")) {
43. break;
44. }
45. line = line.trim();
46.
47. query = parser.parse(line);
48. System.out.println("查询: " + query.toString(field)
);
49.
50. long start = System.currentTimeMillis();
51. doPagingSearch(searcher, query, 5);
52. System.out.println("用时：" + (System.currentTimeMi
llis() - start)
53. + " 毫秒");
54. }
55.
56. reader.close();
57. }
58.
59. public static void doPagingSearch(Searcher searcher, Query
query,
60. int hitsPerPage) throws IOException {
61.
62. TopDocCollector collector = new TopDocCollector(5 * hit
sPerPage);
63. searcher.search(query, collector);
64. ScoreDoc[] hits = collector.topDocs().scoreDocs;
65.
66. int numTotalHits = collector.getTotalHits();
67. System.out.println("符合查询词的文件数：" + numTotalHit
s);
68.
69. int start = 0;
70. int end = Math.min(numTotalHits, hitsPerPage);
71.
72. for (int i = start; i < end; i++) {
73. Document doc = searcher.doc(hits[i].doc);
74. String path = doc.get("filename");

75. if (path != null) {
76. System.out.println((i + 1) + ". " + path);
77. } else {
78. System.out.println((i + 1) + ". " + "沒有这个目
录指定的文件");
79. }
80. }
81. }
82.
83.}

也许我的进度有点缓慢，因为这两天有点事，且总是心神不宁的。今天去火车站
买票了，给我的感觉是跟我平常买票没什么区别，我很容易就买到票了，而且
还有座位。也许是因为家近的原因吧。或者是我买的日期不是一个高峰点。

2009-01-12

下面是从庖丁文档里粘出来的，并为了适应版本 2.4 做了部分修改的代码：

Java 代码

2.
3. import net.paoding.analysis.analyzer.PaodingAnalyzer;
4.
5. import org.apache.lucene.analysis.Analyzer;
6. import org.apache.lucene.analysis.TokenStream;
7. import org.apache.lucene.document.Document;
8. import org.apache.lucene.document.Field;
9. import org.apache.lucene.index.IndexReader;
10.import org.apache.lucene.index.IndexWriter;
11.import org.apache.lucene.index.TermPositionVector;
12.import org.apache.lucene.queryParser.QueryParser;
13.import org.apache.lucene.search.IndexSearcher;
14.import org.apache.lucene.search.Query;
15.import org.apache.lucene.search.ScoreDoc;
16.import org.apache.lucene.search.Searcher;
17.import org.apache.lucene.search.TopDocCollector;
18.import org.apache.lucene.search.highlight.Formatter;
19.import org.apache.lucene.search.highlight.Highlighter;

20.import org.apache.lucene.search.highlight.QueryScorer;
21. import org.apache.lucene.search.highlight.TokenGroup;
22.import org.apache.lucene.search.highlight.TokenSources;
23.
24.public class PaodingExample {
25.
26. public static void main(String[] args) throws Exception {

27. String IDNEX_PATH = "index";
28.
29. // 获取 Paoding 中文分词器
30. Analyzer analyzer = new PaodingAnalyzer();
31.
32. // 建立索引
33. IndexWriter writer = new IndexWriter(IDNEX_PATH, analyz
er, true, IndexWriter.MaxFieldLength.LIMITED);
34. Document doc = new Document();
35. Field field = new Field("content", "你好，世界!", Field
.Store.YES,
36. Field.Index.ANALYZED, Field.TermVector.WITH_POS
ITIONS_OFFSETS);
37. doc.add(field);
38. writer.addDocument(doc);
39. writer.close();
40.
41. System.out.println("Indexed success!");
42.
43. // 检索
44. IndexReader reader = IndexReader.open(IDNEX_PATH);
45. QueryParser parser = new QueryParser("content", analyze
r);
46. Query query = parser.parse("你好");
47. Searcher searcher = new IndexSearcher(reader);
48. TopDocCollector collector = new TopDocCollector(5);
49. searcher.search(query, collector);
50. ScoreDoc[] hits = collector.topDocs().scoreDocs;
51.
52. if (collector.getTotalHits() == 0) {
53. System.out.println("hits.length=0");
54. System.exit(0);
55. }
56.
57. Document doc2 = searcher.doc(hits[0].doc);
58. // 高亮处理

59. String text = doc2.get("content");
60. TermPositionVector tpv = (TermPositionVector) reader.ge
tTermFreqVector(
61. 0, "content");
62. TokenStream ts = TokenSources.getTokenStream(tpv);
63. Formatter formatter = new Formatter() {
64. public String highlightTerm(String srcText, TokenGr
oup g) {
65. if (g.getTotalScore() <= 0) {
66. return srcText;
67. }
68. return "<b>" + srcText + "</b>";
69. }
70. };
71.
72. Highlighter highlighter = new Highlighter(formatter, ne
w QueryScorer(
73. query));
74. String result = highlighter.getBestFragments(ts, text,
5, "…");
75. System.out.println("result:nt" + result);
76. reader.close();
77.
78. }
79.
80.}

? 02:44
? 浏览 (931)
? 评论 (5)
? 分类: 检索/P2P/分布式
? 收藏
? 相关推荐

评论

5 楼每天看看 2009-04-29 引用
//Field.Index.NOT_ANALYZED 文件名称建立索引，但不分词
doc.add(new Field("filename",
file.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED));

这样建立索引，是不是我在搜索时可以

String field = "filename";
QueryParser parser = new QueryParser(field, analyzer);
query = parser.parse(filePath);

这样搜索，结果应该有一条记录。
4 楼 zhyt710 2009-02-07 引用
当然不是，你完全可以逐个遍历 int numTotalHits =
collector.getTotalHits(); 个符合条件的结果。但是对于大数据量的系统，一
般出现的结果数量非常的巨大，所以采取分页显示的方式。就是说把打印结果放
在一个 while 循环里，每次循环都让 start 加一页数量的大小，只要 start 的
值小于 numTotalHits。从而实现分页显示。我在这里是为了尽可能的简化程序，
使读者更容易入门
xuganggogo 写道

Deprecated. Hits will be removed in Lucene 3.0.Instead e. g.
TopDocCollector and TopDocs can be used:   TopDocCollector
collector = new TopDocCollector(hitsPerPage);  
searcher.search(query, collector);   ScoreDoc[] hits =
collector.topDocs().scoreDocs;   for (int i = 0; i <
hits.length; i++) {     int docId =
hits[i].doc;     Document d =
searcher.doc(docId);     // do something with
current hit 这里代替 hits 的方法，只能对 hitsPerPage 个结果进行操作。难道
hitsPerPage 是写死的吗？如果要遍历所有结果集，该如何做？

3 楼 xuganggogo 2009-02-05 引用
Deprecated. Hits will be removed in Lucene 3.0.

Instead e. g. TopDocCollector and TopDocs can be used:

TopDocCollector collector = new TopDocCollector(hitsPerPage);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; i++) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);

// do something with current hit
这里代替 hits 的方法，只能对 hitsPerPage 个结果进行操作。
难道 hitsPerPage 是写死的吗？
如果要遍历所有结果集，该如何做？
2 楼 zhyt710 2009-02-04 引用
并为了适应版
xuganggogo 写道

这个能用作索引 HTML 吗？

当然可以，只要是文本文件都可以。对于不是纯文本文件的如 pdf,excel 等，只
要进行相应的解析成文本后，便可建立索引
1 楼 xuganggogo 2009-02-04 引用
这个能用作索引 HTML 吗？

狠狠撸

Lucene2 4 Demo

More Related Content

What's hot (11)

Viewers also liked (6)

Similar to Lucene2 4 Demo (20)

More from yiditushe (20)

Lucene2 4 Demo