diff --git a/gxflowfulltextsearch/pom.xml b/gxflowfulltextsearch/pom.xml
new file mode 100644
index 000000000..ea5c5856d
--- /dev/null
+++ b/gxflowfulltextsearch/pom.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- NOTE(review): all XML tags were stripped when this patch was garbled;
+     the markup below is reconstructed from the surviving text content. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<parent>
+		<groupId>com.genexus</groupId>
+		<artifactId>parent</artifactId>
+		<version>${revision}${changelist}</version>
+	</parent>
+
+	<artifactId>gxflowfulltextsearch</artifactId>
+	<name>GXflow FullText Search</name>
+
+	<dependencies>
+		<dependency>
+			<groupId>org.apache.commons</groupId>
+			<artifactId>commons-collections4</artifactId>
+			<version>${commons.collections4.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>commons-logging</groupId>
+			<artifactId>commons-logging</artifactId>
+			<version>${commons.logging.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.pdfbox</groupId>
+			<artifactId>pdfbox</artifactId>
+			<version>${pdfbox.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-core</artifactId>
+			<version>${lucene.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.poi</groupId>
+			<artifactId>poi</artifactId>
+			<version>${poi.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.poi</groupId>
+			<artifactId>poi-ooxml</artifactId>
+			<version>${poi.version}</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.logging.log4j</groupId>
+			<artifactId>log4j-core</artifactId>
+			<version>${log4j.version}</version>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<finalName>GXflowFullTextSearch</finalName>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.8.0</version>
+			</plugin>
+		</plugins>
+	</build>
+</project>
diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java
new file mode 100644
index 000000000..d8df364a1
--- /dev/null
+++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java
@@ -0,0 +1,25 @@
+package com.genexus.CA.search;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+/**
+ * Registry of Lucene {@link Analyzer}s keyed by language code.
+ * Thread-safe: the backing map is a ConcurrentHashMap populated at class load.
+ */
+public class AnalyzerManager {
+	// FIX: generic type parameters were stripped from the garbled patch
+	// (raw "Map"); restored to Map<String, Analyzer>.
+	private static final Map<String, Analyzer> ANALYZERS = new ConcurrentHashMap<>();
+
+	static {
+		ANALYZERS.put("default", new StandardAnalyzer());
+		// In the future, when the Lucene version is updated, specific analyzers for different languages can be added here.
+		// For example, for Spanish:
+		// ANALYZERS.put("es", new org.apache.lucene.analysis.es.SpanishAnalyzer());
+	}
+
+	/**
+	 * Returns the analyzer registered for the given language code, falling back
+	 * to the default StandardAnalyzer when the code is null, blank or unknown.
+	 */
+	public static Analyzer getAnalyzer(String lang) {
+		if (lang == null || lang.trim().isEmpty()) {
+			return ANALYZERS.get("default");
+		}
+		return ANALYZERS.getOrDefault(lang, ANALYZERS.get("default"));
+	}
+}
diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java
new file mode 100644
index 000000000..f2f26768f
--- /dev/null
+++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java
@@ -0,0 +1,20 @@
+package com.genexus.CA.search;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Facade that routes index operations to a per-directory {@link Indexer}.
+ * Indexers are created lazily and cached, one per index directory (thread-safe).
+ */
+public class IndexManager {
+	// FIX: generic type parameters were stripped from the garbled patch
+	// (raw "Map"); restored to Map<String, Indexer>.
+	private static final Map<String, Indexer> INDEXERS = new ConcurrentHashMap<>();
+
+	/** Adds (or replaces) the content identified by uri/lang in the index under dir. */
+	public static void addContent(String dir, String uri, String lang, String title, String summary, byte fromFile, String body, String filePath) {
+		getIndexer(dir).addContent(uri, lang, title, summary, fromFile, body, filePath);
+	}
+
+	/** Removes every document with the given uri (all languages) from the index under dir. */
+	public static void deleteContent(String dir, String uri) {
+		getIndexer(dir).deleteContent(uri);
+	}
+
+	private static Indexer getIndexer(String dir) {
+		return INDEXERS.computeIfAbsent(dir, Indexer::new);
+	}
+}
diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java
new file mode 100644
index 000000000..f4b982ac4
--- /dev/null
+++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java
@@ -0,0 +1,259 @@
+package com.genexus.CA.search;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import
org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+
+/**
+ * Maintains a Lucene index in a single directory. Documents are identified by
+ * a normalized (trimmed, lower-cased) URI plus an optional language code.
+ * File bodies are extracted from .docx (POI), .pdf (PDFBox) and .txt/.html
+ * (plain UTF-8 read). Uses the Lucene 2.x API (Hits, string-directory ctors).
+ */
+public final class Indexer {
+	private String indexDirectory = ".";
+	private static final int OPERATION_INDEX = 1;
+	private static final int OPERATION_DELETE = 2;
+
+	private static final Logger logger = LogManager.getLogger(Indexer.class);
+
+	/** Wraps the index under the given directory, creating an empty index if none exists. */
+	Indexer(String directory) {
+		this.indexDirectory = normalizeIndexDirectory(directory);
+		if (!this.indexExists(this.indexDirectory)) {
+			IndexWriter writer = null;
+			try {
+				// 'true' = create a brand-new (empty) index in the directory.
+				writer = new IndexWriter(this.indexDirectory, new StandardAnalyzer(), true);
+			} catch (Exception e) {
+				logger.error("Error creating index directory: {}", this.indexDirectory, e);
+			} finally {
+				// FIX: close in finally so a failure after open cannot leak the writer/lock.
+				if (writer != null) {
+					try {
+						writer.close();
+					} catch (IOException e) {
+						logger.error("Error closing IndexWriter", e);
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Adds (replacing any previous version) a document to the index.
+	 * When fromFile == 1 and filePath is usable, the content is extracted from
+	 * the file; otherwise the given body is indexed. The old document with the
+	 * same (uri, lang) is deleted first so the add acts as an upsert.
+	 */
+	void addContent(String uri, String lang, String title, String summary, byte fromFile, String body, String filePath) {
+		Document doc = new Document();
+		StringBuilder contentBuilder = new StringBuilder();
+		boolean fileContentRead = false;
+		String normalizedUri = normalizeUri(uri);
+		String normalizedLang = normalizeLang(lang);
+
+		if (fromFile == 1 && filePath != null && !filePath.trim().isEmpty()) {
+			String lowerFilePath = filePath.toLowerCase();
+			try {
+				if (this.isDocxExtension(lowerFilePath)) {
+					try (FileInputStream file = new FileInputStream(filePath); XWPFDocument reader = new XWPFDocument(file)) {
+						for (XWPFParagraph p : reader.getParagraphs()) {
+							contentBuilder.append(p.getText()).append(" ");
+						}
+						fileContentRead = true;
+					}
+				} else if (this.isPdfExtension(lowerFilePath)) {
+					try (PDDocument document = Loader.loadPDF(new File(filePath))) {
+						PDFTextStripper tStripper = new PDFTextStripper();
+						contentBuilder.append(tStripper.getText(document));
+						fileContentRead = true;
+					}
+				} else if (this.isTxtExtension(lowerFilePath)) {
+					contentBuilder.append(readTextFile(filePath));
+					fileContentRead = true;
+				}
+			} catch (IOException e) {
+				logger.error("Error reading file content from: {}", filePath, e);
+			}
+		}
+
+		// Fall back to the inline body only when no file content was extracted.
+		if (body != null && !body.isEmpty() && !fileContentRead) {
+			contentBuilder.append(body);
+		}
+
+		String content = contentBuilder.toString();
+
+		// Upsert: remove any existing document for this (uri, lang) first.
+		this.indexOperation(OPERATION_DELETE, normalizedLang, null, normalizedUri);
+
+		doc.add(new Field("uri", normalizedUri, Store.YES, Index.UN_TOKENIZED));
+		doc.add(new Field("language", normalizedLang, Store.YES, Index.UN_TOKENIZED));
+		doc.add(new Field("title", title == null ? "" : title, Store.YES, Index.TOKENIZED));
+		doc.add(new Field("summary", summary == null ? "" : summary, Store.YES, Index.TOKENIZED));
+		doc.add(new Field("content", content, Store.YES, Index.TOKENIZED));
+
+		try {
+			this.indexOperation(OPERATION_INDEX, normalizedLang, doc, null);
+		} catch (Exception e) {
+			logger.error("Error indexing content. uri={}, lang={}", normalizedUri, normalizedLang, e);
+		}
+	}
+
+	/** Deletes every document with the given uri, regardless of language. */
+	void deleteContent(String uri) {
+		try {
+			this.indexOperation(OPERATION_DELETE, null, null, normalizeUri(uri));
+		} catch (Exception e) {
+			logger.error("Error deleting content. uri={}", uri, e);
+		}
+	}
+
+	/**
+	 * Serialized index mutation. OPERATION_INDEX adds doc; OPERATION_DELETE removes
+	 * by uri term (lang == null) or by the internal doc id of (uri, lang).
+	 * Synchronized because Lucene 2.x allows only one writer/deleting reader at a time.
+	 */
+	private synchronized void indexOperation(int op, String lang, Document doc, String uri) {
+		switch (op) {
+		case OPERATION_INDEX:
+			IndexWriter writer = null;
+			try {
+				writer = new IndexWriter(this.getIndexDirectory(), AnalyzerManager.getAnalyzer(lang), false);
+				writer.addDocument(doc);
+				// writer.optimize(); // This is a costly operation and should not be done for every document.
+			} catch (Exception e) {
+				logger.error("Error indexing document. uri={}, lang={}", uri, lang, e);
+			} finally {
+				// FIX: close in finally so an addDocument failure cannot leak the writer/lock.
+				if (writer != null) {
+					try {
+						writer.close();
+					} catch (IOException e) {
+						logger.error("Error closing IndexWriter", e);
+					}
+				}
+			}
+			break;
+		case OPERATION_DELETE:
+			IndexReader reader = null;
+			try {
+				Term term = null;
+				int docId = 0;
+				if (lang == null) {
+					term = new Term("uri", uri);
+				} else {
+					docId = this.getDocumentId(uri, lang);
+				}
+
+				reader = IndexReader.open(this.getIndexDirectory());
+				if (lang == null) {
+					reader.deleteDocuments(term);
+				} else if (docId != -1) {
+					reader.deleteDocument(docId);
+				}
+
+			} catch (Exception e) {
+				logger.error("Error deleting document. uri={}, lang={}", uri, lang, e);
+			} finally {
+				if (reader != null) {
+					try {
+						reader.close();
+					} catch (IOException e) {
+						logger.error("Error closing IndexReader", e);
+					}
+				}
+			}
+			break;
+		}
+	}
+
+	public String getIndexDirectory() {
+		return this.indexDirectory;
+	}
+
+	/** Null/blank directories default to the working directory; always absolute. */
+	private String normalizeIndexDirectory(String dir) {
+		if (dir == null || dir.trim().isEmpty()) {
+			return ".";
+		}
+		return new File(dir).getAbsolutePath();
+	}
+
+	/** Probes the directory by opening a searcher; an IOException means "no index". */
+	private boolean indexExists(String dir) {
+		IndexSearcher searcher = null;
+		try {
+			searcher = new IndexSearcher(dir);
+			return true;
+		} catch (IOException e) {
+			return false;
+		} finally {
+			// FIX: the probe searcher was never closed, leaking file handles.
+			if (searcher != null) {
+				try {
+					searcher.close();
+				} catch (IOException e) {
+					logger.warn("Error closing IndexSearcher during indexExists check", e);
+				}
+			}
+		}
+	}
+
+	/** Returns the internal Lucene doc id for (uri, lang), or -1 when not found. */
+	private int getDocumentId(String uri, String lang) {
+		int documentId = -1;
+
+		try {
+			Hits hits = this.getHits(uri, lang);
+			// FIX: getHits returns null when the underlying search failed; the
+			// original dereferenced it unconditionally (potential NPE).
+			if (hits != null && hits.length() > 0) {
+				documentId = hits.id(0);
+			}
+		} catch (IOException e) {
+			logger.error("Error getting document id. uri={}, lang={}", uri, lang, e);
+		}
+
+		return documentId;
+	}
+
+	private boolean isDocxExtension(String filePath) {
+		return filePath.toLowerCase().endsWith(".docx");
+	}
+
+	/** Runs a uri(+language) term query; returns null if the search itself failed. */
+	private Hits getHits(String uri, String lang) {
+		IndexSearcher searcher = null;
+		Hits hits = null;
+		try {
+			searcher = new IndexSearcher(this.indexDirectory);
+			BooleanQuery query = new BooleanQuery();
+			query.add(new TermQuery(new Term("uri", uri)), Occur.MUST);
+			if (lang != null && !lang.trim().isEmpty()) {
+				query.add(new TermQuery(new Term("language", lang)), Occur.MUST);
+			}
+			hits = searcher.search(query);
+		} catch (IOException e) {
+			logger.error("Error searching hits. uri={}, lang={}", uri, lang, e);
+		} finally {
+			if (searcher != null) {
+				try {
+					searcher.close();
+				} catch (IOException e) {
+					logger.error("Error closing IndexSearcher", e);
+				}
+			}
+		}
+
+		return hits;
+	}
+
+	private String normalizeUri(String uri) {
+		if (uri == null) {
+			return "";
+		}
+		return uri.trim().toLowerCase();
+	}
+
+	private String normalizeLang(String lang) {
+		if (lang == null) {
+			return "";
+		}
+		return lang.trim().toLowerCase();
+	}
+
+	/** Reads a text file as UTF-8, joining lines with single spaces. */
+	private String readTextFile(String filePath) throws IOException {
+		StringBuilder builder = new StringBuilder();
+		try (BufferedReader reader = new BufferedReader(new InputStreamReader(Files.newInputStream(Paths.get(filePath)), StandardCharsets.UTF_8))) {
+			String line;
+			while ((line = reader.readLine()) != null) {
+				builder.append(line).append(' ');
+			}
+		}
+		return builder.toString();
+	}
+
+	private boolean isPdfExtension(String filePath) {
+		return filePath.toLowerCase().endsWith(".pdf");
+	}
+
+	// .html files are read as plain text too (no tag stripping is attempted).
+	private boolean isTxtExtension(String filePath) {
+		String lowerFilePath = filePath.toLowerCase();
+		return lowerFilePath.endsWith(".txt") || lowerFilePath.endsWith(".html");
+	}
+}
diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java
new file mode 100644
index 000000000..676627357
--- /dev/null
+++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java
@@ -0,0 +1,131 @@
+package com.genexus.CA.search;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.MultiFieldQueryParser;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+
+/**
+ * Searches a Lucene index built by {@link Indexer} and renders the hits as an
+ * XML string. Queries run over title/content/summary, optionally filtered by
+ * language, with pagination via 'from' and 'maxResults'.
+ */
+public class Searcher {
+	private static final Logger logger = LogManager.getLogger(Searcher.class);
+
+	/** Escapes the five XML special characters so values can be embedded safely. */
+	private static String escapeXml(String text) {
+		if (text == null) {
+			return "";
+		}
+		// FIX: the entity strings were garbled in the patch (e.g. "&" -> "&",
+		// a no-op). Restored the standard XML entity escapes.
+		return text.replace("&", "&amp;")
+				.replace("<", "&lt;")
+				.replace(">", "&gt;")
+				.replace("\"", "&quot;")
+				.replace("'", "&#39;");
+	}
+
+	/**
+	 * Runs 'query' against the index in 'dir' and returns an XML result page.
+	 * NOTE(review): the XML element/attribute names below are reconstructed —
+	 * the original string literals were stripped from this patch (the source
+	 * appended only empty strings, leaving 'time' and 'totalHits' unused).
+	 * Confirm the names against the XML consumer before merging.
+	 */
+	public static String search(String dir, String lang, String query, int maxResults, int from) {
+		StringBuilder buff = new StringBuilder();
+		long startTime = System.currentTimeMillis();
+
+		if (from < 0) {
+			logger.warn("Search 'from' cannot be negative. Using 0 instead. from={}", from);
+			from = 0;
+		}
+		if (maxResults < 0) {
+			logger.warn("Search 'maxResults' cannot be negative. Using 0 instead. maxResults={}", maxResults);
+			maxResults = 0;
+		}
+
+		if (!indexExists(dir)) {
+			buff.append("<results total=\"0\"/>");
+			return buff.toString();
+		}
+
+		IndexSearcher searcher = null;
+		try {
+			searcher = new IndexSearcher(dir);
+			String[] fields = new String[]{"title", "content", "summary"};
+			Occur[] clauses = new Occur[]{Occur.SHOULD, Occur.SHOULD, Occur.SHOULD};
+
+			Query q;
+			try {
+				q = MultiFieldQueryParser.parse(query, fields, clauses, AnalyzerManager.getAnalyzer(lang));
+			} catch (ParseException e) {
+				// First fallback: escape the query and re-parse.
+				try {
+					String escapedQuery = QueryParser.escape(query);
+					q = MultiFieldQueryParser.parse(escapedQuery, fields, clauses, AnalyzerManager.getAnalyzer(lang));
+					logger.warn("Query had invalid syntax. Escaped version was used: {}", escapedQuery, e);
+				} catch (ParseException escapedException) {
+					// Last resort: raw term on the content field.
+					// NOTE(review): the raw query is not analyzed here, so this may
+					// miss lower-cased terms — confirm whether it should be normalized.
+					logger.warn("Could not parse query, falling back to TermQuery: {}", query, escapedException);
+					q = new TermQuery(new Term("content", query));
+				}
+			}
+
+			// "IND" is treated as "language-independent": no language filter.
+			if (lang != null && !lang.trim().isEmpty() && !"IND".equalsIgnoreCase(lang)) {
+				Query q2 = new TermQuery(new Term("language", lang));
+				BooleanQuery bq = new BooleanQuery();
+				bq.add(q, Occur.MUST);
+				bq.add(q2, Occur.MUST);
+				q = bq;
+			}
+
+			Hits hits = searcher.search(q);
+			int totalHits = hits.length();
+
+			long endTime = System.currentTimeMillis();
+			String time = String.valueOf(endTime - startTime);
+
+			buff.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+			buff.append("<results time=\"").append(time).append("\" total=\"").append(totalHits).append("\">");
+
+			int end = Math.min(totalHits, from + maxResults);
+			for (int i = from; i < end; i++) {
+				buff.append("<item>");
+				Document doc = hits.doc(i);
+				String uri = doc.getField("uri").stringValue();
+				buff.append("<uri>").append(escapeXml(uri)).append("</uri>");
+				buff.append("</item>");
+			}
+		} catch (Exception e) {
+			logger.error("Error during search", e);
+			// Return an empty but valid XML in case of error
+			buff.setLength(0); // Clear buffer
+			buff.append("<results total=\"0\"/>");
+			return buff.toString();
+		} finally {
+			if (searcher != null) {
+				try {
+					searcher.close();
+				} catch (Exception e) {
+					logger.error("Error closing IndexSearcher", e);
+				}
+			}
+		}
+
+		buff.append("</results>");
+		return buff.toString();
+	}
+
+	/** Probes the directory by opening (and closing) a searcher. */
+	private static boolean indexExists(String dir) {
+		IndexSearcher searcher = null;
+		try {
+			searcher = new IndexSearcher(dir);
+			return true;
+		} catch (Exception e) {
+			return false;
+		} finally {
+			if (searcher != null) {
+				try {
+					searcher.close();
+				} catch (Exception e) {
+					logger.warn("Error closing IndexSearcher during indexExists check", e);
+				}
+			}
+		}
+	}
+}
diff --git a/gxsearch/pom.xml b/gxsearch/pom.xml
index 51abb16ea..2c4ba8012 100644
--- a/gxsearch/pom.xml
+++ b/gxsearch/pom.xml
@@ -32,17 +32,17 @@
 		<dependency>
 			<groupId>org.apache.lucene</groupId>
 			<artifactId>lucene-core</artifactId>
-			<version>2.2.0</version>
+			<version>${lucene.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.lucene</groupId>
 			<artifactId>lucene-highlighter</artifactId>
-			<version>2.2.0</version>
+			<version>${lucene.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.lucene</groupId>
 			<artifactId>lucene-spellchecker</artifactId>
-			<version>2.2.0</version>
+			<version>${lucene.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>com.github.jtidy</groupId>
diff --git a/java/pom.xml b/java/pom.xml
index e3c4e5750..c897b8cfc 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -37,7 +37,7 @@
 		<dependency>
 			<groupId>org.apache.commons</groupId>
 			<artifactId>commons-collections4</artifactId>
-			<version>4.1</version>
+			<version>${commons.collections4.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.logging.log4j</groupId>
@@ -110,7 +110,7 @@
 		<dependency>
 			<groupId>org.apache.pdfbox</groupId>
 			<artifactId>pdfbox</artifactId>
-			<version>3.0.3</version>
+			<version>${pdfbox.version}</version>
 		</dependency>
 		<dependency>
 			<groupId>org.jsoup</groupId>
diff --git a/pom.xml b/pom.xml
index ac5559817..d996afcc0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,6 +22,10 @@
 		3.0.17
 		UTF-8
 		5.4.1
+		<pdfbox.version>3.0.3</pdfbox.version>
+		<lucene.version>2.2.0</lucene.version>
+		<commons.collections4.version>4.1</commons.collections4.version>
+		<commons.logging.version>1.2</commons.logging.version>
 		2.16.2
 		4.13.2
 		2.40.8
@@ -130,6 +134,7 @@
 		<module>gamutils</module>
 		<module>gamtotp</module>
 		<module>gxcloudstorage-azureblob-latest</module>
+		<module>gxflowfulltextsearch</module>