|
| 1 | +package com.genexus.CA.search; |
| 2 | + |
| 3 | +import java.io.BufferedReader; |
| 4 | +import java.io.File; |
| 5 | +import java.io.FileInputStream; |
| 6 | +import java.io.FileReader; |
| 7 | +import java.io.IOException; |
| 8 | +import java.util.Iterator; |
| 9 | +import java.util.List; |
| 10 | +import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| 11 | +import org.apache.lucene.document.Document; |
| 12 | +import org.apache.lucene.document.Field; |
| 13 | +import org.apache.lucene.document.Field.Index; |
| 14 | +import org.apache.lucene.document.Field.Store; |
| 15 | +import org.apache.lucene.index.IndexReader; |
| 16 | +import org.apache.lucene.index.IndexWriter; |
| 17 | +import org.apache.lucene.index.Term; |
| 18 | +import org.apache.lucene.search.BooleanQuery; |
| 19 | +import org.apache.lucene.search.Hits; |
| 20 | +import org.apache.lucene.search.IndexSearcher; |
| 21 | +import org.apache.lucene.search.TermQuery; |
| 22 | +import org.apache.lucene.search.BooleanClause.Occur; |
| 23 | +import org.apache.pdfbox.Loader; |
| 24 | +import org.apache.pdfbox.pdmodel.PDDocument; |
| 25 | +import org.apache.pdfbox.text.PDFTextStripper; |
| 26 | +import org.apache.pdfbox.text.PDFTextStripperByArea; |
| 27 | +import org.apache.poi.xwpf.usermodel.XWPFDocument; |
| 28 | +import org.apache.poi.xwpf.usermodel.XWPFParagraph; |
| 29 | + |
| 30 | +public final class Indexer { |
| 31 | + private String indexDirectory = "."; |
| 32 | + private static final int IDX = 1; |
| 33 | + private static final int DLT = 2; |
| 34 | + |
| 35 | + protected Indexer(String directory) { |
| 36 | + this.indexDirectory = directory; |
| 37 | + if (!this.indexExists(directory)) { |
| 38 | + try { |
| 39 | + this.indexDirectory = directory; |
| 40 | + IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true); |
| 41 | + writer.close(); |
| 42 | + } catch (Exception var3) { |
| 43 | + Logger.print(var3.toString()); |
| 44 | + } |
| 45 | + } |
| 46 | + |
| 47 | + } |
| 48 | + |
| 49 | + protected void addContent(String uri, String lang, String title, String summary, byte fromFile, String body, String filePath) { |
| 50 | + Document doc = null; |
| 51 | + doc = new Document(); |
| 52 | + String content = ""; |
| 53 | + if (fromFile == 1) { |
| 54 | + try { |
| 55 | + if (this.isMicrosoftExtension(filePath)) { |
| 56 | + FileInputStream file = new FileInputStream(filePath); |
| 57 | + XWPFDocument reader = new XWPFDocument(file); |
| 58 | + List<XWPFParagraph> data = reader.getParagraphs(); |
| 59 | + |
| 60 | + XWPFParagraph p; |
| 61 | + for(Iterator var14 = data.iterator(); var14.hasNext(); content = content + p.getText()) { |
| 62 | + p = (XWPFParagraph)var14.next(); |
| 63 | + } |
| 64 | + } else if (this.isPdfExtension(filePath)) { |
| 65 | + PDDocument document = Loader.loadPDF(new File(filePath)); |
| 66 | + new PDFTextStripperByArea(); |
| 67 | + PDFTextStripper tStripper = new PDFTextStripper(); |
| 68 | + content = content + tStripper.getText(document); |
| 69 | + } else if (this.isTxtExtension(filePath)) { |
| 70 | + File txt = new File(filePath); |
| 71 | + |
| 72 | + String st; |
| 73 | + for(BufferedReader br = new BufferedReader(new FileReader(txt)); (st = br.readLine()) != null; content = content + st) { |
| 74 | + } |
| 75 | + } |
| 76 | + } catch (IOException var16) { |
| 77 | + var16.printStackTrace(); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + if (doc != null) { |
| 82 | + if (this.documentExists(uri, lang)) { |
| 83 | + this.indexOperation(2, lang, (Document)null, uri.toLowerCase()); |
| 84 | + } |
| 85 | + |
| 86 | + doc.add(new Field("uri", uri, Store.YES, Index.UN_TOKENIZED)); |
| 87 | + doc.add(new Field("content", content, Store.YES, Index.TOKENIZED)); |
| 88 | + |
| 89 | + try { |
| 90 | + this.indexOperation(1, lang, doc, (String)null); |
| 91 | + } catch (Exception var15) { |
| 92 | + Logger.print(var15.toString()); |
| 93 | + } |
| 94 | + } |
| 95 | + |
| 96 | + } |
| 97 | + |
| 98 | + protected void deleteContent(String uri) { |
| 99 | + try { |
| 100 | + this.indexOperation(2, (String)null, (Document)null, uri.toLowerCase()); |
| 101 | + } catch (Exception var3) { |
| 102 | + Logger.print(var3.toString()); |
| 103 | + } |
| 104 | + |
| 105 | + } |
| 106 | + |
| 107 | + protected synchronized void indexOperation(int op, String lang, Document doc, String uri) { |
| 108 | + switch(op) { |
| 109 | + case 1: |
| 110 | + try { |
| 111 | + IndexWriter writer = new IndexWriter(this.getIndexDirectory(), AnalyzerManager.getAnalyzer(lang), false); |
| 112 | + writer.addDocument(doc); |
| 113 | + writer.optimize(); |
| 114 | + writer.close(); |
| 115 | + } catch (Exception var9) { |
| 116 | + Logger.print(var9.toString()); |
| 117 | + } |
| 118 | + break; |
| 119 | + case 2: |
| 120 | + try { |
| 121 | + Term term = null; |
| 122 | + int docId = 0; |
| 123 | + if (lang == null) { |
| 124 | + term = new Term("uri", uri); |
| 125 | + } else { |
| 126 | + docId = this.getDocumentId(uri, lang); |
| 127 | + } |
| 128 | + |
| 129 | + IndexReader reader = IndexReader.open(this.getIndexDirectory()); |
| 130 | + if (lang == null) { |
| 131 | + reader.deleteDocuments(term); |
| 132 | + } else if (docId != -1) { |
| 133 | + reader.deleteDocument(docId); |
| 134 | + } |
| 135 | + |
| 136 | + reader.close(); |
| 137 | + } catch (Exception var8) { |
| 138 | + Logger.print(var8.toString()); |
| 139 | + } |
| 140 | + } |
| 141 | + |
| 142 | + } |
| 143 | + |
| 144 | + public String getIndexDirectory() { |
| 145 | + return this.indexDirectory; |
| 146 | + } |
| 147 | + |
| 148 | + private boolean indexExists(String dir) { |
| 149 | + try { |
| 150 | + new IndexSearcher(dir); |
| 151 | + return true; |
| 152 | + } catch (IOException var3) { |
| 153 | + return false; |
| 154 | + } |
| 155 | + } |
| 156 | + |
| 157 | + private boolean documentExists(String uri, String lang) { |
| 158 | + boolean value = false; |
| 159 | + |
| 160 | + try { |
| 161 | + IndexSearcher searcher = new IndexSearcher(this.indexDirectory); |
| 162 | + BooleanQuery query = new BooleanQuery(); |
| 163 | + query.add(new TermQuery(new Term("uri", uri)), Occur.MUST); |
| 164 | + query.add(new TermQuery(new Term("language", lang)), Occur.MUST); |
| 165 | + Hits hits = searcher.search(query); |
| 166 | + searcher.close(); |
| 167 | + if (hits.length() > 0) { |
| 168 | + value = true; |
| 169 | + } |
| 170 | + } catch (IOException var7) { |
| 171 | + Logger.print(var7.toString()); |
| 172 | + } |
| 173 | + |
| 174 | + return value; |
| 175 | + } |
| 176 | + |
| 177 | + private int getDocumentId(String uri, String lang) { |
| 178 | + int value = -1; |
| 179 | + |
| 180 | + try { |
| 181 | + IndexSearcher searcher = new IndexSearcher(this.indexDirectory); |
| 182 | + BooleanQuery query = new BooleanQuery(); |
| 183 | + query.add(new TermQuery(new Term("uri", uri)), Occur.MUST); |
| 184 | + query.add(new TermQuery(new Term("language", lang)), Occur.MUST); |
| 185 | + Hits hits = searcher.search(query); |
| 186 | + if (hits.length() > 0) { |
| 187 | + value = hits.id(0); |
| 188 | + } |
| 189 | + |
| 190 | + searcher.close(); |
| 191 | + } catch (IOException var7) { |
| 192 | + Logger.print(var7.toString()); |
| 193 | + } |
| 194 | + |
| 195 | + return value; |
| 196 | + } |
| 197 | + |
| 198 | + private boolean isMicrosoftExtension(String filePath) { |
| 199 | + return filePath.endsWith(".doc") || filePath.endsWith(".docx") || filePath.endsWith(".xls") || filePath.endsWith(".xlsx") || filePath.endsWith(".ppt") || filePath.endsWith(".pptx"); |
| 200 | + } |
| 201 | + |
| 202 | + private boolean isPdfExtension(String filePath) { |
| 203 | + return filePath.endsWith(".pdf"); |
| 204 | + } |
| 205 | + |
| 206 | + private boolean isTxtExtension(String filePath) { |
| 207 | + return filePath.endsWith(".txt") || filePath.endsWith(".html"); |
| 208 | + } |
| 209 | +} |
0 commit comments