diff --git a/oak-examples/standalone/pom.xml b/oak-examples/standalone/pom.xml
index 78392806f99..ccd4f0788f9 100644
--- a/oak-examples/standalone/pom.xml
+++ b/oak-examples/standalone/pom.xml
@@ -129,7 +129,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
diff --git a/oak-examples/webapp/pom.xml b/oak-examples/webapp/pom.xml
index e9598625caa..871f3a1db85 100644
--- a/oak-examples/webapp/pom.xml
+++ b/oak-examples/webapp/pom.xml
@@ -51,7 +51,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
diff --git a/oak-lucene/pom.xml b/oak-lucene/pom.xml
index b5b411cdd1d..b3adf7489da 100644
--- a/oak-lucene/pom.xml
+++ b/oak-lucene/pom.xml
@@ -389,7 +389,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
test
diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml
index e8bab712107..861a469025e 100644
--- a/oak-parent/pom.xml
+++ b/oak-parent/pom.xml
@@ -62,7 +62,7 @@
1.7.36
1.2.13
2.1.214
- 1.28.5
+ 3.2.3
10.15.2.0
2.17.3
1.21.1
diff --git a/oak-pojosr/pom.xml b/oak-pojosr/pom.xml
index a332470978d..bef72253b1b 100644
--- a/oak-pojosr/pom.xml
+++ b/oak-pojosr/pom.xml
@@ -192,7 +192,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
test
diff --git a/oak-run/pom.xml b/oak-run/pom.xml
index 7001f086e87..f1ae7d0e3fe 100644
--- a/oak-run/pom.xml
+++ b/oak-run/pom.xml
@@ -34,6 +34,7 @@
9.4.53.v20231009
- 91226112
+ 93585333
@@ -364,7 +365,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
diff --git a/oak-run/src/main/assembly/oak-run.xml b/oak-run/src/main/assembly/oak-run.xml
index d9b5d9dbdef..0cb9a2b9fc2 100644
--- a/oak-run/src/main/assembly/oak-run.xml
+++ b/oak-run/src/main/assembly/oak-run.xml
@@ -32,7 +32,7 @@
org.apache.lucene
org.apache.tika:tika-core:*
- org.apache.tika:tika-parsers:*
+ org.apache.tika:tika-parsers-standard-package:*
org.apache.jackrabbit:jackrabbit-aws-ext:*
io.prometheus:simpleclient*:*
@@ -60,7 +60,7 @@
/
org.apache.tika:tika-core
- org.apache.tika:tika-parsers
+ org.apache.tika:tika-parsers-standard-package
commons-logging:commons-logging
true
diff --git a/oak-search-elastic/pom.xml b/oak-search-elastic/pom.xml
index ec2d14bc037..e91fed571de 100644
--- a/oak-search-elastic/pom.xml
+++ b/oak-search-elastic/pom.xml
@@ -146,7 +146,7 @@
org.apache.tika
- tika-parsers
+ tika-parsers-standard-package
${tika.version}
test
diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
index 447cc0a582c..e0c8ff6f7e6 100644
--- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
+++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java
@@ -22,22 +22,22 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
+import java.util.Optional;
import java.util.Set;
-import javax.xml.parsers.DocumentBuilder;
-
-import org.apache.jackrabbit.oak.commons.StringUtils;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
public class TikaParserConfig {
- private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
+ private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class);
/**
* Determines the set of MediaType which have been configured with an EmptyParser.
@@ -48,50 +48,32 @@ public class TikaParserConfig {
public static Set getNonIndexedMediaTypes(InputStream configStream) throws
TikaException, IOException, SAXException {
Set result = new HashSet<>();
- Element element = getBuilder().parse(configStream).getDocumentElement();
- NodeList nodes = element.getElementsByTagName("parsers");
- if (nodes.getLength() == 1) {
- Node parentNode = nodes.item(0);
- NodeList parsersNodes = parentNode.getChildNodes();
- for (int i = 0; i < parsersNodes.getLength(); i++) {
- Node node = parsersNodes.item(i);
- if (node instanceof Element) {
- String className = ((Element) node).getAttribute("class");
- if (EMPTY_PARSER.equals(className)) {
- NodeList mimes = ((Element) node).getElementsByTagName("mime");
- parseMimeTypes(result, mimes);
- }
- }
+ TikaConfig config = new TikaConfig(configStream);
+ if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) {
+ // pick the first component parser that is, or decorates, an EmptyParser; only its supported types are collected. NOTE(review): further EmptyParser entries are ignored by findFirst() - confirm this is intended
+ Optional emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream()
+ .filter(p -> isEmptyParser(p))
+ .findFirst();
+ if (emptyParser.isPresent()) {
+ emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add);
}
+ } else {
+ log.debug("Tika CompositeParser not used, no parsers configured via custom tika config");
}
return result;
}
-
- private static void parseMimeTypes(Set result, NodeList mimes) {
- /*
-
- application/x-archive
- application/x-bzip
- application/x-bzip2
-
- */
- for (int j = 0; j < mimes.getLength(); j++) {
- Node mime = mimes.item(j);
- if (mime instanceof Element) {
- String mimeValue = mime.getTextContent();
- mimeValue = StringUtils.emptyToNull(mimeValue);
- if (mimeValue != null) {
- MediaType mediaType = MediaType.parse(mimeValue.trim());
- if (mediaType != null) {
- result.add(mediaType);
- }
- }
- }
+ /**
+ * Checks whether the given parser is an EmptyParser, recursively unwrapping any ParserDecorator via getWrappedParser().
+ * @param parser the parser to inspect; may itself be a ParserDecorator wrapping another parser
+ * @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser
+ */
+ private static boolean isEmptyParser(Parser parser) {
+ if (parser instanceof org.apache.tika.parser.EmptyParser) {
+ return true;
+ } else if (parser instanceof org.apache.tika.parser.ParserDecorator) {
+ return isEmptyParser(((ParserDecorator) parser).getWrappedParser());
}
- }
-
- private static DocumentBuilder getBuilder() throws TikaException {
- return new ParseContext().getDocumentBuilder();
+ return false;
}
}