diff --git a/checkstyle.xml b/checkstyle.xml index a10df412a..76b58c95f 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -95,9 +95,7 @@ under the License. - - - + @@ -254,7 +252,6 @@ under the License. - - - @@ -387,7 +382,6 @@ under the License. LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_WHILE, METHOD_CALL, METHOD_DEF, QUESTION, RESOURCE_SPECIFICATION, SUPER_CTOR_CALL, LAMBDA, RECORD_DEF, RECORD_PATTERN_DEF"/> - @@ -410,9 +404,7 @@ under the License. - - - + @@ -470,7 +462,6 @@ under the License. value="^(?![a-z]$)(?![a-z][A-Z])[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)*(?:_[0-9]+)*$"/> - diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 26c9afedd..aff263f4a 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -183,184 +183,184 @@ public void loadJSONResources(InputStream inputStream) } return urlToFilter; } -} - -class Rules { - - private Scope globalRules; - private Map domainRules = new HashMap<>(); - private Map hostNameRules = new HashMap<>(); - private List metadataRules = new ArrayList<>(); - - public void addScope(Scope s, Scope.Type t, String value) { - if (t.equals(Scope.Type.GLOBAL)) { - globalRules = s; - } else if (t.equals(Scope.Type.DOMAIN)) { - domainRules.put(value, s); - } else if (t.equals(Scope.Type.HOSTNAME)) { - hostNameRules.put(value, s); - } else if (t.equals(Scope.Type.METADATA)) { - metadataRules.add(new MDScope(value, s.getRules())); - } - } - /** - * Try the rules from the hostname, domain name, metadata and global scopes in this order. - * Returns true if the URL should be removed, false otherwise. The value returns the value of - * the first matching rule, be it positive or negative. - * - * @throws MalformedURLException - */ - public boolean filter(String url, Metadata metadata) throws MalformedURLException { - URL u = URLUtil.toURL(url); - - // first try the full hostname - String hostname = u.getHost(); - if (checkScope(hostNameRules.get(hostname), u)) { - return true; + static class Rules { + + private Scope globalRules; + private Map domainRules = new HashMap<>(); + private Map hostNameRules = new HashMap<>(); + private List metadataRules = new ArrayList<>(); + + public void addScope(Scope s, Scope.Type t, String value) { + if (t.equals(Scope.Type.GLOBAL)) { + globalRules = s; + } else if (t.equals(Scope.Type.DOMAIN)) { + domainRules.put(value, s); + } else if (t.equals(Scope.Type.HOSTNAME)) { + hostNameRules.put(value, s); + } else if (t.equals(Scope.Type.METADATA)) { + metadataRules.add(new MDScope(value, s.getRules())); + } } - // then on the various components of the domain - final String[] domainParts = hostname.split("\\."); - String domain = null; - for (int i = domainParts.length - 1; i >= 0; i--) { - domain = domainParts[i] + (domain == null ? "" : "." + domain); - if (checkScope(domainRules.get(domain), u)) { + /** + * Try the rules from the hostname, domain name, metadata and global scopes in this order. + * Returns true if the URL should be removed, false otherwise. The value returns the value + * of the first matching rule, be it positive or negative. + * + * @throws MalformedURLException + */ + public boolean filter(String url, Metadata metadata) throws MalformedURLException { + URL u = URLUtil.toURL(url); + + // first try the full hostname + String hostname = u.getHost(); + if (checkScope(hostNameRules.get(hostname), u)) { return true; } - } - // check on parent's URL metadata - for (MDScope scope : metadataRules) { - final String[] vals = metadata.getValues(scope.getKey()); - if (vals == null) { - continue; + // then on the various components of the domain + final String[] domainParts = hostname.split("\\."); + String domain = null; + for (int i = domainParts.length - 1; i >= 0; i--) { + domain = domainParts[i] + (domain == null ? "" : "." + domain); + if (checkScope(domainRules.get(domain), u)) { + return true; + } } - for (String v : vals) { - if (v.equalsIgnoreCase(scope.getValue())) { - FastURLFilter.LOG.debug( - "Filtering {} matching metadata {}:{}", - url, - scope.getKey(), - scope.getValue()); - if (checkScope(scope, u)) { - return true; + + // check on parent's URL metadata + for (MDScope scope : metadataRules) { + final String[] vals = metadata.getValues(scope.getKey()); + if (vals == null) { + continue; + } + for (String v : vals) { + if (v.equalsIgnoreCase(scope.getValue())) { + FastURLFilter.LOG.debug( + "Filtering {} matching metadata {}:{}", + url, + scope.getKey(), + scope.getValue()); + if (checkScope(scope, u)) { + return true; + } } } } - } - if (checkScope(globalRules, u)) { - return true; - } - - return false; - } + if (checkScope(globalRules, u)) { + return true; + } - private boolean checkScope(Scope s, URL u) { - if (s == null) { return false; } - for (Rule r : s.getRules()) { - String haystack = u.getPath(); - // whether to include the query as well? - if (r.getType().toString().endsWith("QUERY")) { - if (u.getQuery() != null) { - haystack += "?" + u.getQuery(); - } + + private boolean checkScope(Scope s, URL u) { + if (s == null) { + return false; } - if (r.getPattern().matcher(haystack).find()) { - // matches! returns true for DENY, false for ALLOW - return r.getType().toString().startsWith("DENY"); + for (Rule r : s.getRules()) { + String haystack = u.getPath(); + // whether to include the query as well? + if (r.getType().toString().endsWith("QUERY")) { + if (u.getQuery() != null) { + haystack += "?" + u.getQuery(); + } + } + if (r.getPattern().matcher(haystack).find()) { + // matches! returns true for DENY, false for ALLOW + return r.getType().toString().startsWith("DENY"); + } } + return false; } - return false; } -} -class Scope { + static class Scope { - public enum Type { - DOMAIN, - GLOBAL, - HOSTNAME, - METADATA - } + public enum Type { + DOMAIN, + GLOBAL, + HOSTNAME, + METADATA + } - protected Rule[] rules; + protected Rule[] rules; - public void setRules(List rlist) { - this.rules = rlist.toArray(new Rule[0]); - } + public void setRules(List rlist) { + this.rules = rlist.toArray(new Rule[0]); + } - public Rule[] getRules() { - return rules; + public Rule[] getRules() { + return rules; + } } -} -class MDScope extends Scope { + static class MDScope extends Scope { - private String key; - private String value; + private String key; + private String value; - MDScope(String constraint, Rule[] rules) { - this.rules = rules; - int eq = constraint.indexOf("="); - if (eq != -1) { - key = constraint.substring(0, eq); - value = constraint.substring(eq + 1); - } else { - key = constraint; + MDScope(String constraint, Rule[] rules) { + this.rules = rules; + int eq = constraint.indexOf("="); + if (eq != -1) { + key = constraint.substring(0, eq); + value = constraint.substring(eq + 1); + } else { + key = constraint; + } } - } - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public String getValue() { - return value; + public String getValue() { + return value; + } } -} -class Rule { + static class Rule { - public enum Type { - DENYPATH, - DENYPATHQUERY, - ALLOWPATH, - ALLOWPATHQUERY - } + public enum Type { + DENYPATH, + DENYPATHQUERY, + ALLOWPATH, + ALLOWPATHQUERY + } - private Type type; - private Pattern pattern; - - public Rule(String line) { - int offset = 0; - String lcline = line.toLowerCase(Locale.ROOT); - // separate the type from the pattern - for (Type t : Type.values()) { - String start = t.toString().toLowerCase(Locale.ROOT) + " "; - if (lcline.startsWith(start)) { - type = t; - offset = start.length(); - break; + private Type type; + private Pattern pattern; + + public Rule(String line) { + int offset = 0; + String lcline = line.toLowerCase(Locale.ROOT); + // separate the type from the pattern + for (Type t : Type.values()) { + String start = t.toString().toLowerCase(Locale.ROOT) + " "; + if (lcline.startsWith(start)) { + type = t; + offset = start.length(); + break; + } + } + // no match? + if (type == null) { + return; } - } - // no match? - if (type == null) { - return; - } - String patternString = line.substring(offset).trim(); - pattern = Pattern.compile(patternString); - } + String patternString = line.substring(offset).trim(); + pattern = Pattern.compile(patternString); + } - public Type getType() { - return type; - } + public Type getType() { + return type; + } - public Pattern getPattern() { - return pattern; + public Pattern getPattern() { + return pattern; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java index ba5c5cd05..9181ff853 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java @@ -125,78 +125,78 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse.get(url).getMetadata().setValues(key, tags); } } -} -class Collections { + static class Collections { - private Set collections; + private Set collections; - public void setCollections(Set collections) { - this.collections = collections; - } + public void setCollections(Set collections) { + this.collections = collections; + } - public String[] tag(String url) { - Set tags = new HashSet<>(); - for (Collection collection : collections) { - if (collection.matches(url)) { - tags.add(collection.getName()); + public String[] tag(String url) { + Set tags = new HashSet<>(); + for (Collection collection : collections) { + if (collection.matches(url)) { + tags.add(collection.getName()); + } } + return tags.toArray(new String[0]); } - return tags.toArray(new String[0]); } -} - -class Collection { - - private String name; - private Set includePatterns; - private Set excludePatterns; - public String getName() { - return name; - } + static class Collection { - public void setName(String name) { - this.name = name; - } + private String name; + private Set includePatterns; + private Set excludePatterns; - /** - * @return true if the URL matches a pattern for this collection and no exclusion patterns - */ - public boolean matches(String url) { - boolean matches = false; - for (Pattern includeP : includePatterns) { - Matcher m = includeP.matcher(url); - if (m.matches()) { - matches = true; - break; - } - } - // no match - if (!matches) { - return false; + public String getName() { + return name; } - if (excludePatterns == null) { - return true; + public void setName(String name) { + this.name = name; } - // check for antipatterns - for (Pattern excludeP : excludePatterns) { - Matcher m = excludeP.matcher(url); - if (m.matches()) { + /** + * @return true if the URL matches a pattern for this collection and no exclusion patterns + */ + public boolean matches(String url) { + boolean matches = false; + for (Pattern includeP : includePatterns) { + Matcher m = includeP.matcher(url); + if (m.matches()) { + matches = true; + break; + } + } + // no match + if (!matches) { return false; } - } - return true; - } + if (excludePatterns == null) { + return true; + } - public void setIncludePatterns(Set includePatterns) { - this.includePatterns = includePatterns; - } + // check for antipatterns + for (Pattern excludeP : excludePatterns) { + Matcher m = excludeP.matcher(url); + if (m.matches()) { + return false; + } + } - public void setExcludePatterns(Set excludePatterns) { - this.excludePatterns = excludePatterns; + return true; + } + + public void setIncludePatterns(Set includePatterns) { + this.includePatterns = includePatterns; + } + + public void setExcludePatterns(Set excludePatterns) { + this.excludePatterns = excludePatterns; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java index 221dae683..6c9182c50 100644 --- a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java +++ b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java @@ -160,39 +160,39 @@ public void deactivate() { super.deactivate(); active = false; } -} - -class ScheduledURL implements Comparable { - Date nextFetchDate; - String url; - Metadata metadata; - - ScheduledURL(String url, Metadata m, Date nextFetchDate) { - this.nextFetchDate = nextFetchDate; - this.url = url; - this.metadata = m; - } - @Override - public String toString() { - return url + "\t" + nextFetchDate; - } + static class ScheduledURL implements Comparable { + Date nextFetchDate; + String url; + Metadata metadata; - /** Sort by next fetch date then URl. * */ - @Override - public int compareTo(ScheduledURL o) { - // compare the URL - int compString = url.compareTo(o.url); - if (compString == 0) { - return 0; + ScheduledURL(String url, Metadata m, Date nextFetchDate) { + this.nextFetchDate = nextFetchDate; + this.url = url; + this.metadata = m; } - // compare the date - int comp = nextFetchDate.compareTo(o.nextFetchDate); - if (comp != 0) { - return comp; + @Override + public String toString() { + return url + "\t" + nextFetchDate; } - return compString; + /** Sort by next fetch date then URl. * */ + @Override + public int compareTo(ScheduledURL o) { + // compare the URL + int compString = url.compareTo(o.url); + if (compString == 0) { + return 0; + } + + // compare the date + int comp = nextFetchDate.compareTo(o.nextFetchDate); + if (comp != 0) { + return comp; + } + + return compString; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java index 3a6fd83e2..38592bfee 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java +++ b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java @@ -123,7 +123,7 @@ protected void configure(Map conf) { * the URL path. */ public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata parentMetadata) { - Metadata md = _filter(parentMetadata, mdToTransfer); + Metadata md = filter(parentMetadata, mdToTransfer); // keep the path? if (trackPath) { @@ -150,11 +150,11 @@ public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata p * not necessarily transferred to the outlinks. */ public Metadata filter(Metadata metadata) { - Metadata filteredMetadata = _filter(metadata, mdToTransfer); + Metadata filteredMetadata = filter(metadata, mdToTransfer); // add the features that are only persisted but // not transferred like __redirTo_ - filteredMetadata.putAll(_filter(metadata, mdToPersistOnly)); + filteredMetadata.putAll(filter(metadata, mdToPersistOnly)); return filteredMetadata; } @@ -163,7 +163,7 @@ public Metadata filter(Metadata metadata) { * Filter the metadata based on a set of keys. If a key ends with a * then all the keys starting * with the prefix will be added. */ - private Metadata _filter(Metadata metadata, Set filter) { + private Metadata filter(Metadata metadata, Set filter) { Metadata filteredMetadata = new Metadata(); for (String key : filter) { diff --git a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java index 8c886152a..cbb8a140e 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java +++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java @@ -20,7 +20,7 @@ import org.apache.storm.metric.api.IReducer; /** Used to return an average value per second. */ -public class PerSecondReducer implements IReducer { +public class PerSecondReducer implements IReducer { @Override public TimeReducerState init() { @@ -54,9 +54,9 @@ public Object extractResult(TimeReducerState accumulator) { double permsec = accumulator.sum / msec; return permsec * 1000d; } -} -class TimeReducerState { - public long started = System.currentTimeMillis(); - public double sum = 0.0; + static class TimeReducerState { + public long started = System.currentTimeMillis(); + public double sum = 0.0; + } } diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java index cca978a59..2942c6a56 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java @@ -115,8 +115,8 @@ void setupParserBolt() { setupParserBolt(bolt); } + /** Checks that content in script is not included in the text representation. */ @Test - /** Checks that content in script is not included in the text representation */ void testNoScriptInText() throws IOException { bolt.prepare( new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); @@ -129,8 +129,8 @@ void testNoScriptInText() throws IOException { "Text should not contain the content of script tags"); } + /** Checks that individual links marked as rel="nofollow" are not followed. */ @Test - /** Checks that individual links marked as rel="nofollow" are not followed */ void testNoFollowOutlinks() throws IOException { bolt.prepare( new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java index 6e9268318..ee831691d 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java @@ -39,7 +39,7 @@ public class SubDocumentsParseFilter extends ParseFilter { LoggerFactory.getLogger(SubDocumentsParseFilter.class); @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { InputStream stream = new ByteArrayInputStream(content); @@ -69,7 +69,7 @@ public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult } } } catch (Exception e) { - LOG.error("Error processing sitemap from {}: {}", URL, e); + LOG.error("Error processing sitemap from {}: {}", url, e); } } diff --git a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java index 29351fd7e..639ae2877 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java @@ -32,7 +32,7 @@ class InitialisationUtilTest { @Test - void can_initialize_a_simple_class() { + void can_initialize_simple_class() { final SimpleOpenClass simpleOpenClass = InitialisationUtil.initializeFromQualifiedName( SimpleOpenClass.class.getName(), SimpleOpenClass.class); @@ -150,7 +150,7 @@ void fails_if_qualified_class_name_is_blank() { } @Test - void fails_if_class_to_initialize_not_extending_classes_to_test_1() { + void fails_if_class_to_initialize_not_extending_classes_to_test() { Assertions.assertThrows( RuntimeException.class, () -> @@ -161,7 +161,7 @@ void fails_if_class_to_initialize_not_extending_classes_to_test_1() { } @Test - void fails_if_class_to_initialize_not_extending_classes_to_test_2() { + void fails_if_class_to_initialize_not_extending_classes_to_test_other() { Assertions.assertThrows( RuntimeException.class, () -> diff --git a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java index d28a6d941..b2b297ad1 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java @@ -68,7 +68,7 @@ void testCustomTransferClass() throws MalformedURLException { conf = new HashMap<>(); conf.put( MetadataTransfer.metadataTransferClassParamName, - myCustomTransferClass.class.getName()); + MyCustomTransferClass.class.getName()); hasThrownException = false; try { MetadataTransfer.getInstance(conf); @@ -112,6 +112,6 @@ void testFilterWithAsterisk() { filteredMetadata = mdt.filter(metadata); Assertions.assertEquals(6, filteredMetadata.size()); } -} -class myCustomTransferClass extends MetadataTransfer {} + static class MyCustomTransferClass extends MetadataTransfer {} +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java index a5946cea3..6f6d7be66 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -66,8 +66,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - refresher.getDelegate().filter(URL, content, doc, parse); + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { + refresher.getDelegate().filter(url, content, doc, parse); } @Override diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java index 920ad5dea..67bcca62b 100644 --- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java @@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {} + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {} @Override public String getResourceFile() { diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java index 13fa40fba..d8995ff87 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -69,8 +69,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - refresher.getDelegate().filter(URL, content, doc, parse); + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { + refresher.getDelegate().filter(url, content, doc, parse); } @Override diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java index 920ad5dea..67bcca62b 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java @@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {} + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {} @Override public String getResourceFile() { diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java index 301b106dd..420b9a0bc 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java @@ -229,7 +229,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except // NOTE: The handler will only be called for the first url if the // response is a redirect. page.route( - _url -> true, + lambdaUrl -> true, route -> { // abort if we know the main page is a redirection if (status.get() != -1) { diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java index fc4e5e33a..425a0a0cc 100644 --- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java +++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java @@ -45,12 +45,12 @@ void setupParserBolt() { setupParserBolt(bolt); } - @Test /** - * Checks that recursive docs are handled correctly + * Checks that recursive docs are handled correctly. * - * @see https://issues.apache.org/jira/browse/TIKA-2096 + * @see TIKA-2096 */ + @Test void testRecursiveDoc() throws IOException { Map conf = new HashMap<>(); conf.put("parser.extract.embedded", true); @@ -71,12 +71,12 @@ void testRecursiveDoc() throws IOException { .contains("Life, Liberty and the pursuit of Happiness")); } - @Test /** - * Checks that the mimetype whitelists are handled correctly + * Checks that the mimetype whitelists are handled correctly. * - * @see https://github.com/apache/stormcrawler/issues/712 + * @see #712 */ + @Test void testMimeTypeWhileList() throws IOException { Map conf = new HashMap<>(); conf.put("parser.mimetype.whitelist", "application/.+word.*"); diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java index 8bb1937d3..5b77a7289 100644 --- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java +++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java @@ -59,8 +59,8 @@ public WARCHdfsBolt() { withFsUrl("file:///"); } - public WARCHdfsBolt withHeader(Map header_fields) { - this.header_fields = header_fields; + public WARCHdfsBolt withHeader(Map headerFields) { + this.header_fields = headerFields; return this; } diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java index 0dbac9bb9..662355122 100644 --- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java +++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java @@ -190,34 +190,24 @@ private Tuple getPage(String httpVersionString) { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // + metadata.addValue( protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, "GET / " + httpVersionString - + // - "\r\n" - + // - "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n" - + // - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" - + // - "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" - + // - "Accept-Encoding: br,gzip\r\n" - + // - "Host: example.org\r\n" + + "\r\n" + + "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n" + + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" + + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" + + "Accept-Encoding: br,gzip\r\n" + + "Host: example.org\r\n" + "Connection: Keep-Alive\r\n\r\n"); - metadata.addValue( // + metadata.addValue( protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, httpVersionString - + // - " 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + " 200 OK\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close\r\n\r\n"); metadata.addValue( protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY, diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java index c02fc8e38..6c66204d7 100644 --- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java +++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java @@ -122,15 +122,12 @@ void testReplaceHeaders() { byte[] content = txt.getBytes(StandardCharsets.UTF_8); String sha1str = "sha1:D6FMCDZDYW23YELHXWUEXAZ6LQCXU56S"; Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, "HTTP/1.1 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close"); metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123"); Tuple tuple = mock(Tuple.class); @@ -165,15 +162,12 @@ void testReplaceHttpVersion() { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, "HTTP/2 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close"); metadata.addValue( protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY, @@ -208,17 +202,13 @@ void testRequestHeader() { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, "GET / HTTP/2\r\n" - + // - "User-Agent: mybot\r\n" - + // - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" - + // - "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" - + // - "Accept-Encoding: br,gzip\r\n" + + "User-Agent: mybot\r\n" + + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" + + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" + + "Accept-Encoding: br,gzip\r\n" + "Connection: Keep-Alive\r\n\r\n"); metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123"); Tuple tuple = mock(Tuple.class);