diff --git a/oak-examples/standalone/pom.xml b/oak-examples/standalone/pom.xml index 78392806f99..ccd4f0788f9 100644 --- a/oak-examples/standalone/pom.xml +++ b/oak-examples/standalone/pom.xml @@ -129,7 +129,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} diff --git a/oak-examples/webapp/pom.xml b/oak-examples/webapp/pom.xml index e9598625caa..871f3a1db85 100644 --- a/oak-examples/webapp/pom.xml +++ b/oak-examples/webapp/pom.xml @@ -51,7 +51,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} diff --git a/oak-lucene/pom.xml b/oak-lucene/pom.xml index b5b411cdd1d..b3adf7489da 100644 --- a/oak-lucene/pom.xml +++ b/oak-lucene/pom.xml @@ -389,7 +389,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} test diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml index e8bab712107..861a469025e 100644 --- a/oak-parent/pom.xml +++ b/oak-parent/pom.xml @@ -62,7 +62,7 @@ 1.7.36 1.2.13 2.1.214 - 1.28.5 + 3.2.3 10.15.2.0 2.17.3 1.21.1 diff --git a/oak-pojosr/pom.xml b/oak-pojosr/pom.xml index a332470978d..bef72253b1b 100644 --- a/oak-pojosr/pom.xml +++ b/oak-pojosr/pom.xml @@ -192,7 +192,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} test diff --git a/oak-run/pom.xml b/oak-run/pom.xml index 7001f086e87..f1ae7d0e3fe 100644 --- a/oak-run/pom.xml +++ b/oak-run/pom.xml @@ -34,6 +34,7 @@ 9.4.53.v20231009 - 91226112 + 93585333 @@ -364,7 +365,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} diff --git a/oak-run/src/main/assembly/oak-run.xml b/oak-run/src/main/assembly/oak-run.xml index d9b5d9dbdef..0cb9a2b9fc2 100644 --- a/oak-run/src/main/assembly/oak-run.xml +++ b/oak-run/src/main/assembly/oak-run.xml @@ -32,7 +32,7 @@ org.apache.lucene org.apache.tika:tika-core:* - org.apache.tika:tika-parsers:* + org.apache.tika:tika-parsers-standard-package:* org.apache.jackrabbit:jackrabbit-aws-ext:* io.prometheus:simpleclient*:* @@ -60,7 +60,7 @@ / org.apache.tika:tika-core - org.apache.tika:tika-parsers + org.apache.tika:tika-parsers-standard-package commons-logging:commons-logging true diff --git a/oak-search-elastic/pom.xml b/oak-search-elastic/pom.xml index ec2d14bc037..e91fed571de 100644 --- a/oak-search-elastic/pom.xml +++ b/oak-search-elastic/pom.xml @@ -146,7 +146,7 @@ org.apache.tika - tika-parsers + tika-parsers-standard-package ${tika.version} test diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java index 447cc0a582c..e0c8ff6f7e6 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java @@ -22,22 +22,22 @@ import java.io.IOException; import java.io.InputStream; import java.util.HashSet; +import java.util.Optional; import java.util.Set; -import javax.xml.parsers.DocumentBuilder; - -import org.apache.jackrabbit.oak.commons.StringUtils; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; public class TikaParserConfig { - private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser"; + private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class); /** * Determines the set of MediaType which have been configured with an EmptyParser. @@ -48,50 +48,32 @@ public class TikaParserConfig { public static Set getNonIndexedMediaTypes(InputStream configStream) throws TikaException, IOException, SAXException { Set result = new HashSet<>(); - Element element = getBuilder().parse(configStream).getDocumentElement(); - NodeList nodes = element.getElementsByTagName("parsers"); - if (nodes.getLength() == 1) { - Node parentNode = nodes.item(0); - NodeList parsersNodes = parentNode.getChildNodes(); - for (int i = 0; i < parsersNodes.getLength(); i++) { - Node node = parsersNodes.item(i); - if (node instanceof Element) { - String className = ((Element) node).getAttribute("class"); - if (EMPTY_PARSER.equals(className)) { - NodeList mimes = ((Element) node).getElementsByTagName("mime"); - parseMimeTypes(result, mimes); - } - } + TikaConfig config = new TikaConfig(configStream); + if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) { + // pick the (decorated) empty parser + Optional emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream() + .filter(p -> isEmptyParser(p)) + .findFirst(); + if (emptyParser.isPresent()) { + emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add); } + } else { + log.debug("Tika CompositeParser not used, no parsers configured via custom tika config"); } return result; } - - private static void parseMimeTypes(Set result, NodeList mimes) { - /* - - application/x-archive - application/x-bzip - application/x-bzip2 - - */ - for (int j = 0; j < mimes.getLength(); j++) { - Node mime = mimes.item(j); - if (mime instanceof Element) { - String mimeValue = mime.getTextContent(); - mimeValue = StringUtils.emptyToNull(mimeValue); - if (mimeValue != null) { - MediaType mediaType = MediaType.parse(mimeValue.trim()); - if (mediaType != null) { - result.add(mediaType); - } - } - } + /** + * Returns true if the given parser is an EmptyParser or decorates an EmptyParser. + * @param parser + * @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser + */ + private static boolean isEmptyParser(Parser parser) { + if (parser instanceof org.apache.tika.parser.EmptyParser) { + return true; + } else if (parser instanceof org.apache.tika.parser.ParserDecorator) { + return isEmptyParser(((ParserDecorator) parser).getWrappedParser()); } - } - - private static DocumentBuilder getBuilder() throws TikaException { - return new ParseContext().getDocumentBuilder(); + return false; } }