Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion oak-examples/standalone/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<exclusions>
<exclusion>
Expand Down
2 changes: 1 addition & 1 deletion oak-examples/webapp/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<exclusions>
<exclusion>
Expand Down
2 changes: 1 addition & 1 deletion oak-lucene/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<scope>test</scope>
<exclusions>
Expand Down
2 changes: 1 addition & 1 deletion oak-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
<slf4j.version>1.7.36</slf4j.version> <!-- sync with logback version -->
<logback.version>1.2.13</logback.version>
<h2.version>2.1.214</h2.version>
<tika.version>1.28.5</tika.version>
<tika.version>3.2.3</tika.version>
<derby.version>10.15.2.0</derby.version>
<jackson.version>2.17.3</jackson.version>
<testcontainers.version>1.21.1</testcontainers.version>
Expand Down
2 changes: 1 addition & 1 deletion oak-pojosr/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<scope>test</scope>
<exclusions>
Expand Down
5 changes: 3 additions & 2 deletions oak-run/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
<jetty.version>9.4.53.v20231009</jetty.version>
<!--
Size History:
+ 89 MB (Tika 3.2.3, OAK-9752)
+ 87 MB (Aws Sdk 2.x, OAK-11935)
+ 84 MB (RDB/Tomcat, OAK-10752)
+ 80 MB (Java 17, OAK-10638)
Expand All @@ -52,7 +53,7 @@
+ 41 MB build failing on the release profile (OAK-6250)
+ 38 MB. Initial value. Current 35MB plus a 10%
-->
<max.jar.size>91226112</max.jar.size>
<max.jar.size>93585333</max.jar.size>
</properties>

<build>
Expand Down Expand Up @@ -364,7 +365,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
Expand Down
4 changes: 2 additions & 2 deletions oak-run/src/main/assembly/oak-run.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
<excludes>
<exclude>org.apache.lucene</exclude>
<exclude>org.apache.tika:tika-core:*</exclude>
<exclude>org.apache.tika:tika-parsers:*</exclude>
<exclude>org.apache.tika:tika-parsers-standard-package:*</exclude>
<exclude>org.apache.jackrabbit:jackrabbit-aws-ext:*</exclude>
<exclude>io.prometheus:simpleclient*:*</exclude>
</excludes>
Expand Down Expand Up @@ -60,7 +60,7 @@
<outputDirectory>/</outputDirectory>
<includes>
<include>org.apache.tika:tika-core</include>
<include>org.apache.tika:tika-parsers</include>
<include>org.apache.tika:tika-parsers-standard-package</include>
<include>commons-logging:commons-logging</include>
</includes>
<useStrictFiltering>true</useStrictFiltering>
Expand Down
2 changes: 1 addition & 1 deletion oak-search-elastic/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<scope>test</scope>
<exclusions>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,22 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;

import org.apache.jackrabbit.oak.commons.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

public class TikaParserConfig {

private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class);

/**
* Determines the set of MediaType which have been configured with an EmptyParser.
Expand All @@ -48,50 +48,32 @@ public class TikaParserConfig {
public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
TikaException, IOException, SAXException {
Set<MediaType> result = new HashSet<>();
Element element = getBuilder().parse(configStream).getDocumentElement();
NodeList nodes = element.getElementsByTagName("parsers");
if (nodes.getLength() == 1) {
Node parentNode = nodes.item(0);
NodeList parsersNodes = parentNode.getChildNodes();
for (int i = 0; i < parsersNodes.getLength(); i++) {
Node node = parsersNodes.item(i);
if (node instanceof Element) {
String className = ((Element) node).getAttribute("class");
if (EMPTY_PARSER.equals(className)) {
NodeList mimes = ((Element) node).getElementsByTagName("mime");
parseMimeTypes(result, mimes);
}
}
TikaConfig config = new TikaConfig(configStream);
if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) {
// pick the first component parser that is, or decorates, an EmptyParser
Optional<Parser> emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream()
.filter(p -> isEmptyParser(p))
.findFirst();
if (emptyParser.isPresent()) {
emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add);
}
} else {
log.debug("Tika CompositeParser not used, no parsers configured via custom tika config");
}
return result;
}


private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
/*
<parser class="org.apache.tika.parser.EmptyParser">
<mime>application/x-archive</mime>
<mime>application/x-bzip</mime>
<mime>application/x-bzip2</mime>
</parser>
*/
for (int j = 0; j < mimes.getLength(); j++) {
Node mime = mimes.item(j);
if (mime instanceof Element) {
String mimeValue = mime.getTextContent();
mimeValue = StringUtils.emptyToNull(mimeValue);
if (mimeValue != null) {
MediaType mediaType = MediaType.parse(mimeValue.trim());
if (mediaType != null) {
result.add(mediaType);
}
}
}
/**
* Returns true if the given parser is an EmptyParser or decorates an EmptyParser.
* @param parser the parser to inspect; may be a {@link ParserDecorator} wrapping another parser
* @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser
*/
private static boolean isEmptyParser(Parser parser) {
if (parser instanceof org.apache.tika.parser.EmptyParser) {
return true;
} else if (parser instanceof org.apache.tika.parser.ParserDecorator) {
return isEmptyParser(((ParserDecorator) parser).getWrappedParser());
}
}

private static DocumentBuilder getBuilder() throws TikaException {
return new ParseContext().getDocumentBuilder();
return false;
}
}
Loading