From f213b435c55e32a79a184c27c29318ef5b6fa5a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Fri, 15 May 2026 20:50:37 +0200 Subject: [PATCH 1/8] Fix Javadoc position issues and make them error severity --- checkstyle.xml | 4 +--- .../stormcrawler/bolt/JSoupParserBoltTest.java | 4 ++-- .../org/apache/stormcrawler/tika/ParserBoltTest.java | 12 ++++++------ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index a10df412a..6c1cdb6fd 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -410,9 +410,7 @@ under the License. - - - + diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java index cca978a59..2942c6a56 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java @@ -115,8 +115,8 @@ void setupParserBolt() { setupParserBolt(bolt); } + /** Checks that content in script is not included in the text representation. */ @Test - /** Checks that content in script is not included in the text representation */ void testNoScriptInText() throws IOException { bolt.prepare( new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); @@ -129,8 +129,8 @@ void testNoScriptInText() throws IOException { "Text should not contain the content of script tags"); } + /** Checks that individual links marked as rel="nofollow" are not followed. */ @Test - /** Checks that individual links marked as rel="nofollow" are not followed */ void testNoFollowOutlinks() throws IOException { bolt.prepare( new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java index fc4e5e33a..425a0a0cc 100644 --- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java +++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java @@ -45,12 +45,12 @@ void setupParserBolt() { setupParserBolt(bolt); } - @Test /** - * Checks that recursive docs are handled correctly + * Checks that recursive docs are handled correctly. * - * @see https://issues.apache.org/jira/browse/TIKA-2096 + * @see TIKA-2096 */ + @Test void testRecursiveDoc() throws IOException { Map conf = new HashMap<>(); conf.put("parser.extract.embedded", true); @@ -71,12 +71,12 @@ void testRecursiveDoc() throws IOException { .contains("Life, Liberty and the pursuit of Happiness")); } - @Test /** - * Checks that the mimetype whitelists are handled correctly + * Checks that the mimetype whitelists are handled correctly. * - * @see https://github.com/apache/stormcrawler/issues/712 + * @see #712 */ + @Test void testMimeTypeWhileList() throws IOException { Map conf = new HashMap<>(); conf.put("parser.mimetype.whitelist", "application/.+word.*"); From 139d2f96b3f0b31621e4ee9bb1f6bf2151668106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Fri, 15 May 2026 21:00:32 +0200 Subject: [PATCH 2/8] Fix Lambda parameter name issues and make them error severity --- checkstyle.xml | 1 - .../apache/stormcrawler/protocol/playwright/HttpProtocol.java | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index 6c1cdb6fd..1feacf8ae 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -279,7 +279,6 @@ under the License. - diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java index 301b106dd..420b9a0bc 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java @@ -229,7 +229,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except // NOTE: The handler will only be called for the first url if the // response is a redirect. page.route( - _url -> true, + lambdaUrl -> true, route -> { // abort if we know the main page is a redirection if (status.get() != -1) { From 237f0f70d534076db1028d46e211c44dc09e6c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Fri, 15 May 2026 21:06:46 +0200 Subject: [PATCH 3/8] Fix Type name issues and make them error severity --- checkstyle.xml | 1 - .../org/apache/stormcrawler/util/MetadataTransferTest.java | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index 1feacf8ae..00c445f53 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -254,7 +254,6 @@ under the License. - - diff --git a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java index 3a6fd83e2..38592bfee 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java +++ b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java @@ -123,7 +123,7 @@ protected void configure(Map conf) { * the URL path. */ public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata parentMetadata) { - Metadata md = _filter(parentMetadata, mdToTransfer); + Metadata md = filter(parentMetadata, mdToTransfer); // keep the path? if (trackPath) { @@ -150,11 +150,11 @@ public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata p * not necessarily transferred to the outlinks. */ public Metadata filter(Metadata metadata) { - Metadata filteredMetadata = _filter(metadata, mdToTransfer); + Metadata filteredMetadata = filter(metadata, mdToTransfer); // add the features that are only persisted but // not transferred like __redirTo_ - filteredMetadata.putAll(_filter(metadata, mdToPersistOnly)); + filteredMetadata.putAll(filter(metadata, mdToPersistOnly)); return filteredMetadata; } @@ -163,7 +163,7 @@ public Metadata filter(Metadata metadata) { * Filter the metadata based on a set of keys. If a key ends with a * then all the keys starting * with the prefix will be added. */ - private Metadata _filter(Metadata metadata, Set filter) { + private Metadata filter(Metadata metadata, Set filter) { Metadata filteredMetadata = new Metadata(); for (String key : filter) { diff --git a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java index 29351fd7e..639ae2877 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java @@ -32,7 +32,7 @@ class InitialisationUtilTest { @Test - void can_initialize_a_simple_class() { + void can_initialize_simple_class() { final SimpleOpenClass simpleOpenClass = InitialisationUtil.initializeFromQualifiedName( SimpleOpenClass.class.getName(), SimpleOpenClass.class); @@ -150,7 +150,7 @@ void fails_if_qualified_class_name_is_blank() { } @Test - void fails_if_class_to_initialize_not_extending_classes_to_test_1() { + void fails_if_class_to_initialize_not_extending_classes_to_test() { Assertions.assertThrows( RuntimeException.class, () -> @@ -161,7 +161,7 @@ void fails_if_class_to_initialize_not_extending_classes_to_test_1() { } @Test - void fails_if_class_to_initialize_not_extending_classes_to_test_2() { + void fails_if_class_to_initialize_not_extending_classes_to_test_other() { Assertions.assertThrows( RuntimeException.class, () -> From e87b405966d4740c0c473ba82a7bd29ce81768d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Fri, 15 May 2026 21:21:05 +0200 Subject: [PATCH 5/8] Fix Parenthesis padding issues and make them error severity --- checkstyle.xml | 1 - .../stormcrawler/warc/WARCHdfsBoltTest.java | 34 ++++++--------- .../warc/WARCRecordFormatTest.java | 42 +++++++------------ 3 files changed, 28 insertions(+), 49 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index a8b2e20ad..ac10c10d5 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -385,7 +385,6 @@ under the License. LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_WHILE, METHOD_CALL, METHOD_DEF, QUESTION, RESOURCE_SPECIFICATION, SUPER_CTOR_CALL, LAMBDA, RECORD_DEF, RECORD_PATTERN_DEF"/> - diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java index 0dbac9bb9..662355122 100644 --- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java +++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java @@ -190,34 +190,24 @@ private Tuple getPage(String httpVersionString) { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // + metadata.addValue( protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, "GET / " + httpVersionString - + // - "\r\n" - + // - "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n" - + // - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" - + // - "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" - + // - "Accept-Encoding: br,gzip\r\n" - + // - "Host: example.org\r\n" + + "\r\n" + + "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n" + + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" + + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" + + "Accept-Encoding: br,gzip\r\n" + + "Host: example.org\r\n" + "Connection: Keep-Alive\r\n\r\n"); - metadata.addValue( // + metadata.addValue( protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, httpVersionString - + // - " 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + " 200 OK\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close\r\n\r\n"); metadata.addValue( protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY, diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java index c02fc8e38..6c66204d7 100644 --- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java +++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java @@ -122,15 +122,12 @@ void testReplaceHeaders() { byte[] content = txt.getBytes(StandardCharsets.UTF_8); String sha1str = "sha1:D6FMCDZDYW23YELHXWUEXAZ6LQCXU56S"; Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, "HTTP/1.1 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close"); metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123"); Tuple tuple = mock(Tuple.class); @@ -165,15 +162,12 @@ void testReplaceHttpVersion() { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, "HTTP/2 200 OK\r\n" - + // - "Content-Type: text/html\r\n" - + // - "Content-Encoding: gzip\r\n" - + // - "Content-Length: 26\r\n" + + "Content-Type: text/html\r\n" + + "Content-Encoding: gzip\r\n" + + "Content-Length: 26\r\n" + "Connection: close"); metadata.addValue( protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY, @@ -208,17 +202,13 @@ void testRequestHeader() { String txt = "abcdef"; byte[] content = txt.getBytes(StandardCharsets.UTF_8); Metadata metadata = new Metadata(); - metadata.addValue( // - protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, // + metadata.addValue( + protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, "GET / HTTP/2\r\n" - + // - "User-Agent: mybot\r\n" - + // - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" - + // - "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" - + // - "Accept-Encoding: br,gzip\r\n" + + "User-Agent: mybot\r\n" + + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" + + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n" + + "Accept-Encoding: br,gzip\r\n" + "Connection: Keep-Alive\r\n\r\n"); metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123"); Tuple tuple = mock(Tuple.class); From 8af66b4290f23dada8906c3a5c6315a5f2381809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Fri, 15 May 2026 21:25:01 +0200 Subject: [PATCH 6/8] Fix Parameter name issues and make them error severity --- checkstyle.xml | 1 - .../stormcrawler/parse/filter/SubDocumentsParseFilter.java | 4 ++-- .../opensearch/parse/filter/JSONResourceWrapper.java | 4 ++-- .../apache/stormcrawler/opensearch/DelegateRefresherTest.java | 2 +- .../opensearch/parse/filter/JSONResourceWrapper.java | 4 ++-- .../apache/stormcrawler/opensearch/DelegateRefresherTest.java | 2 +- .../main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java | 4 ++-- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index ac10c10d5..eb596bb85 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -272,7 +272,6 @@ under the License. - diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java index 6e9268318..ee831691d 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java @@ -39,7 +39,7 @@ public class SubDocumentsParseFilter extends ParseFilter { LoggerFactory.getLogger(SubDocumentsParseFilter.class); @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { InputStream stream = new ByteArrayInputStream(content); @@ -69,7 +69,7 @@ public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult } } } catch (Exception e) { - LOG.error("Error processing sitemap from {}: {}", URL, e); + LOG.error("Error processing sitemap from {}: {}", url, e); } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java index a5946cea3..6f6d7be66 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -66,8 +66,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - refresher.getDelegate().filter(URL, content, doc, parse); + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { + refresher.getDelegate().filter(url, content, doc, parse); } @Override diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java index 920ad5dea..67bcca62b 100644 --- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java @@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {} + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {} @Override public String getResourceFile() { diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java index 13fa40fba..d8995ff87 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -69,8 +69,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - refresher.getDelegate().filter(URL, content, doc, parse); + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { + refresher.getDelegate().filter(url, content, doc, parse); } @Override diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java index 920ad5dea..67bcca62b 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java @@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {} + public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {} @Override public String getResourceFile() { diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java index 8bb1937d3..5b77a7289 100644 --- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java +++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java @@ -59,8 +59,8 @@ public WARCHdfsBolt() { withFsUrl("file:///"); } - public WARCHdfsBolt withHeader(Map header_fields) { - this.header_fields = header_fields; + public WARCHdfsBolt withHeader(Map headerFields) { + this.header_fields = headerFields; return this; } From 22c0b2ee78a9e8a064f9755ed8e378d377af261b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Sun, 17 May 2026 16:38:29 +0200 Subject: [PATCH 7/8] Fix one top level class issues and make them error severity --- checkstyle.xml | 4 +- .../filtering/regex/FastURLFilter.java | 284 +++++++++--------- .../parse/filter/CollectionTagger.java | 115 +++---- .../stormcrawler/spout/MemorySpout.java | 68 +++-- .../stormcrawler/util/PerSecondReducer.java | 16 +- .../util/MetadataTransferTest.java | 23 +- 6 files changed, 257 insertions(+), 253 deletions(-) diff --git a/checkstyle.xml b/checkstyle.xml index eb596bb85..76b58c95f 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -95,9 +95,7 @@ under the License. - - - + diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 26c9afedd..038fe5e81 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -90,7 +90,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode // config via json failed - trying from global config if (this.resourceFile == null) { this.resourceFile = - ConfUtils.getString(stormConf, "fast.urlfilter.file", "fast.urlfilter.json"); + ConfUtils.getString(stormConf, "fast.urlfilter.file", "fast.urlfilter.json"); } try { @@ -108,7 +108,7 @@ public String getResourceFile() { @Override public void loadJSONResources(InputStream inputStream) - throws JsonParseException, JsonMappingException, IOException { + throws JsonParseException, JsonMappingException, IOException { JsonNode rootNode = objectMapper.readTree(inputStream); @@ -171,9 +171,9 @@ public void loadJSONResources(InputStream inputStream) @Override public @Nullable String filter( - @Nullable URL sourceUrl, - @Nullable Metadata sourceMetadata, - @NotNull String urlToFilter) { + @Nullable URL sourceUrl, + @Nullable Metadata sourceMetadata, + @NotNull String urlToFilter) { try { if (rules.filter(urlToFilter, sourceMetadata)) { return null; @@ -183,184 +183,184 @@ public void loadJSONResources(InputStream inputStream) } return urlToFilter; } -} - -class Rules { - - private Scope globalRules; - private Map domainRules = new HashMap<>(); - private Map hostNameRules = new HashMap<>(); - private List metadataRules = new ArrayList<>(); - - public void addScope(Scope s, Scope.Type t, String value) { - if (t.equals(Scope.Type.GLOBAL)) { - globalRules = s; - } else if (t.equals(Scope.Type.DOMAIN)) { - domainRules.put(value, s); - } else if (t.equals(Scope.Type.HOSTNAME)) { - hostNameRules.put(value, s); - } else if (t.equals(Scope.Type.METADATA)) { - metadataRules.add(new MDScope(value, s.getRules())); - } - } - /** - * Try the rules from the hostname, domain name, metadata and global scopes in this order. - * Returns true if the URL should be removed, false otherwise. The value returns the value of - * the first matching rule, be it positive or negative. - * - * @throws MalformedURLException - */ - public boolean filter(String url, Metadata metadata) throws MalformedURLException { - URL u = URLUtil.toURL(url); - - // first try the full hostname - String hostname = u.getHost(); - if (checkScope(hostNameRules.get(hostname), u)) { - return true; + static class Rules { + + private Scope globalRules; + private Map domainRules = new HashMap<>(); + private Map hostNameRules = new HashMap<>(); + private List metadataRules = new ArrayList<>(); + + public void addScope(Scope s, Scope.Type t, String value) { + if (t.equals(Scope.Type.GLOBAL)) { + globalRules = s; + } else if (t.equals(Scope.Type.DOMAIN)) { + domainRules.put(value, s); + } else if (t.equals(Scope.Type.HOSTNAME)) { + hostNameRules.put(value, s); + } else if (t.equals(Scope.Type.METADATA)) { + metadataRules.add(new MDScope(value, s.getRules())); + } } - // then on the various components of the domain - final String[] domainParts = hostname.split("\\."); - String domain = null; - for (int i = domainParts.length - 1; i >= 0; i--) { - domain = domainParts[i] + (domain == null ? "" : "." + domain); - if (checkScope(domainRules.get(domain), u)) { + /** + * Try the rules from the hostname, domain name, metadata and global scopes in this order. + * Returns true if the URL should be removed, false otherwise. The value returns the value of + * the first matching rule, be it positive or negative. + * + * @throws MalformedURLException + */ + public boolean filter(String url, Metadata metadata) throws MalformedURLException { + URL u = URLUtil.toURL(url); + + // first try the full hostname + String hostname = u.getHost(); + if (checkScope(hostNameRules.get(hostname), u)) { return true; } - } - // check on parent's URL metadata - for (MDScope scope : metadataRules) { - final String[] vals = metadata.getValues(scope.getKey()); - if (vals == null) { - continue; + // then on the various components of the domain + final String[] domainParts = hostname.split("\\."); + String domain = null; + for (int i = domainParts.length - 1; i >= 0; i--) { + domain = domainParts[i] + (domain == null ? "" : "." + domain); + if (checkScope(domainRules.get(domain), u)) { + return true; + } } - for (String v : vals) { - if (v.equalsIgnoreCase(scope.getValue())) { - FastURLFilter.LOG.debug( + + // check on parent's URL metadata + for (MDScope scope : metadataRules) { + final String[] vals = metadata.getValues(scope.getKey()); + if (vals == null) { + continue; + } + for (String v : vals) { + if (v.equalsIgnoreCase(scope.getValue())) { + FastURLFilter.LOG.debug( "Filtering {} matching metadata {}:{}", url, scope.getKey(), scope.getValue()); - if (checkScope(scope, u)) { - return true; + if (checkScope(scope, u)) { + return true; + } } } } - } - if (checkScope(globalRules, u)) { - return true; - } - - return false; - } + if (checkScope(globalRules, u)) { + return true; + } - private boolean checkScope(Scope s, URL u) { - if (s == null) { return false; } - for (Rule r : s.getRules()) { - String haystack = u.getPath(); - // whether to include the query as well? - if (r.getType().toString().endsWith("QUERY")) { - if (u.getQuery() != null) { - haystack += "?" + u.getQuery(); - } + + private boolean checkScope(Scope s, URL u) { + if (s == null) { + return false; } - if (r.getPattern().matcher(haystack).find()) { - // matches! returns true for DENY, false for ALLOW - return r.getType().toString().startsWith("DENY"); + for (Rule r : s.getRules()) { + String haystack = u.getPath(); + // whether to include the query as well? + if (r.getType().toString().endsWith("QUERY")) { + if (u.getQuery() != null) { + haystack += "?" + u.getQuery(); + } + } + if (r.getPattern().matcher(haystack).find()) { + // matches! returns true for DENY, false for ALLOW + return r.getType().toString().startsWith("DENY"); + } } + return false; } - return false; } -} -class Scope { + static class Scope { - public enum Type { - DOMAIN, - GLOBAL, - HOSTNAME, - METADATA - } + public enum Type { + DOMAIN, + GLOBAL, + HOSTNAME, + METADATA + } - protected Rule[] rules; + protected Rule[] rules; - public void setRules(List rlist) { - this.rules = rlist.toArray(new Rule[0]); - } + public void setRules(List rlist) { + this.rules = rlist.toArray(new Rule[0]); + } - public Rule[] getRules() { - return rules; + public Rule[] getRules() { + return rules; + } } -} -class MDScope extends Scope { + static class MDScope extends Scope { - private String key; - private String value; + private String key; + private String value; - MDScope(String constraint, Rule[] rules) { - this.rules = rules; - int eq = constraint.indexOf("="); - if (eq != -1) { - key = constraint.substring(0, eq); - value = constraint.substring(eq + 1); - } else { - key = constraint; + MDScope(String constraint, Rule[] rules) { + this.rules = rules; + int eq = constraint.indexOf("="); + if (eq != -1) { + key = constraint.substring(0, eq); + value = constraint.substring(eq + 1); + } else { + key = constraint; + } } - } - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public String getValue() { - return value; + public String getValue() { + return value; + } } -} -class Rule { + static class Rule { - public enum Type { - DENYPATH, - DENYPATHQUERY, - ALLOWPATH, - ALLOWPATHQUERY - } + public enum Type { + DENYPATH, + DENYPATHQUERY, + ALLOWPATH, + ALLOWPATHQUERY + } - private Type type; - private Pattern pattern; - - public Rule(String line) { - int offset = 0; - String lcline = line.toLowerCase(Locale.ROOT); - // separate the type from the pattern - for (Type t : Type.values()) { - String start = t.toString().toLowerCase(Locale.ROOT) + " "; - if (lcline.startsWith(start)) { - type = t; - offset = start.length(); - break; + private Type type; + private Pattern pattern; + + public Rule(String line) { + int offset = 0; + String lcline = line.toLowerCase(Locale.ROOT); + // separate the type from the pattern + for (Type t : Type.values()) { + String start = t.toString().toLowerCase(Locale.ROOT) + " "; + if (lcline.startsWith(start)) { + type = t; + offset = start.length(); + break; + } + } + // no match? + if (type == null) { + return; } - } - // no match? - if (type == null) { - return; - } - String patternString = line.substring(offset).trim(); - pattern = Pattern.compile(patternString); - } + String patternString = line.substring(offset).trim(); + pattern = Pattern.compile(patternString); + } - public Type getType() { - return type; - } + public Type getType() { + return type; + } - public Pattern getPattern() { - return pattern; + public Pattern getPattern() { + return pattern; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java index ba5c5cd05..64831db8c 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java @@ -70,7 +70,8 @@ public class CollectionTagger extends ParseFilter implements JSONResource { private static final Logger LOG = LoggerFactory.getLogger(CollectionTagger.class); private static final ObjectMapper objectMapper = new ObjectMapper(); - private static final TypeReference reference = new TypeReference<>() {}; + private static final TypeReference reference = new TypeReference<>() { + }; private Collections collections = new Collections(); @@ -96,7 +97,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } if (this.resourceFile == null) { this.resourceFile = - ConfUtils.getString(stormConf, "collections.file", "collections.json"); + ConfUtils.getString(stormConf, "collections.file", "collections.json"); } try { @@ -114,7 +115,7 @@ public String getResourceFile() { @Override public void loadJSONResources(InputStream inputStream) - throws JsonParseException, JsonMappingException, IOException { + throws JsonParseException, JsonMappingException, IOException { collections = (Collections) objectMapper.readValue(inputStream, reference); } @@ -125,78 +126,78 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse.get(url).getMetadata().setValues(key, tags); } } -} -class Collections { + static class Collections { - private Set collections; + private Set collections; - public void setCollections(Set collections) { - this.collections = collections; - } + public void setCollections(Set collections) { + this.collections = collections; + } - public String[] tag(String url) { - Set tags = new HashSet<>(); - for (Collection collection : collections) { - if (collection.matches(url)) { - tags.add(collection.getName()); + public String[] tag(String url) { + Set tags = new HashSet<>(); + for (Collection collection : collections) { + if (collection.matches(url)) { + tags.add(collection.getName()); + } } + return tags.toArray(new String[0]); } - return tags.toArray(new String[0]); } -} - -class Collection { - - private String name; - private Set includePatterns; - private Set excludePatterns; - public String getName() { - return name; - } + static class Collection { - public void setName(String name) { - this.name = name; - } + private String name; + private Set includePatterns; + private Set excludePatterns; - /** - * @return true if the URL matches a pattern for this collection and no exclusion patterns - */ - public boolean matches(String url) { - boolean matches = false; - for (Pattern includeP : includePatterns) { - Matcher m = includeP.matcher(url); - if (m.matches()) { - matches = true; - break; - } - } - // no match - if (!matches) { - return false; + public String getName() { + return name; } - if (excludePatterns == null) { - return true; + public void setName(String name) { + this.name = name; } - // check for antipatterns - for (Pattern excludeP : excludePatterns) { - Matcher m = excludeP.matcher(url); - if (m.matches()) { + /** + * @return true if the URL matches a pattern for this collection and no exclusion patterns + */ + public boolean matches(String url) { + boolean matches = false; + for (Pattern includeP : includePatterns) { + Matcher m = includeP.matcher(url); + if (m.matches()) { + matches = true; + break; + } + } + // no match + if (!matches) { return false; } - } - return true; - } + if (excludePatterns == null) { + return true; + } - public void setIncludePatterns(Set includePatterns) { - this.includePatterns = includePatterns; - } + // check for antipatterns + for (Pattern excludeP : excludePatterns) { + Matcher m = excludeP.matcher(url); + if (m.matches()) { + return false; + } + } - public void setExcludePatterns(Set excludePatterns) { - this.excludePatterns = excludePatterns; + return true; + } + + public void setIncludePatterns(Set includePatterns) { + this.includePatterns = includePatterns; + } + + public void setExcludePatterns(Set excludePatterns) { + this.excludePatterns = excludePatterns; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java index 221dae683..9848a6c74 100644 --- a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java +++ b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java @@ -66,14 +66,16 @@ public MemorySpout(String... urls) { * statusupdaterbolt. * * @param withDiscoveredStatus whether the tuples generated should contain a Status field with - * DISCOVERED as value and be emitted on the status stream + * DISCOVERED as value and be emitted on the status stream */ public MemorySpout(boolean withDiscoveredStatus, String... urls) { this.withDiscoveredStatus = withDiscoveredStatus; startingUrls = urls; } - /** Add a new URL with the given metadata and nextFetch-date. */ + /** + * Add a new URL with the given metadata and nextFetch-date. + */ public static void add(String url, Metadata md, Date nextFetch) { LOG.debug("Adding {} with md {} and nextFetch {}", url, md, nextFetch); ScheduledURL tuple = new ScheduledURL(url, md, nextFetch); @@ -84,7 +86,7 @@ public static void add(String url, Metadata md, Date nextFetch) { @Override public void open( - Map conf, TopologyContext context, SpoutOutputCollector collector) { + Map conf, TopologyContext context, SpoutOutputCollector collector) { this.collector = collector; // check that there is only one instance of it @@ -97,7 +99,7 @@ public void open( for (String u : startingUrls) { LOG.debug("About to deserialize {} ", u); List tuple = - scheme.deserialize(ByteBuffer.wrap(u.getBytes(StandardCharsets.UTF_8))); + scheme.deserialize(ByteBuffer.wrap(u.getBytes(StandardCharsets.UTF_8))); add((String) tuple.get(0), (Metadata) tuple.get(1), now); } CrawlerMetrics.registerGauge(context, conf, "queue_size", queue::size, 10); @@ -160,39 +162,41 @@ public void deactivate() { super.deactivate(); active = false; } -} - -class ScheduledURL implements Comparable { - Date nextFetchDate; - String url; - Metadata metadata; - ScheduledURL(String url, Metadata m, Date nextFetchDate) { - this.nextFetchDate = nextFetchDate; - this.url = url; - this.metadata = m; - } - - @Override - public String toString() { - return url + "\t" + nextFetchDate; - } + static class ScheduledURL implements Comparable { + Date nextFetchDate; + String url; + Metadata metadata; - /** Sort by next fetch date then URl. * */ - @Override - public int compareTo(ScheduledURL o) { - // compare the URL - int compString = url.compareTo(o.url); - if (compString == 0) { - return 0; + ScheduledURL(String url, Metadata m, Date nextFetchDate) { + this.nextFetchDate = nextFetchDate; + this.url = url; + this.metadata = m; } - // compare the date - int comp = nextFetchDate.compareTo(o.nextFetchDate); - if (comp != 0) { - return comp; + @Override + public String toString() { + return url + "\t" + nextFetchDate; } - return compString; + /** + * Sort by next fetch date then URl. * + */ + @Override + public int compareTo(ScheduledURL o) { + // compare the URL + int compString = url.compareTo(o.url); + if (compString == 0) { + return 0; + } + + // compare the date + int comp = nextFetchDate.compareTo(o.nextFetchDate); + if (comp != 0) { + return comp; + } + + return compString; + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java index 8c886152a..b698e7ed5 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java +++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java @@ -20,7 +20,7 @@ import org.apache.storm.metric.api.IReducer; /** Used to return an average value per second. */ -public class PerSecondReducer implements IReducer { +public class PerSecondReducer implements IReducer { @Override public TimeReducerState init() { @@ -37,9 +37,9 @@ public TimeReducerState reduce(TimeReducerState accumulator, Object input) { accumulator.sum += ((Integer) input).doubleValue(); } else { throw new RuntimeException( - "MeanReducer::reduce called with unsupported input type `" - + input.getClass() - + "`. Supported types are Double, Long, Integer."); + "MeanReducer::reduce called with unsupported input type `" + + input.getClass() + + "`. Supported types are Double, Long, Integer."); } return accumulator; } @@ -54,9 +54,9 @@ public Object extractResult(TimeReducerState accumulator) { double permsec = accumulator.sum / msec; return permsec * 1000d; } -} -class TimeReducerState { - public long started = System.currentTimeMillis(); - public double sum = 0.0; + static class TimeReducerState { + public long started = System.currentTimeMillis(); + public double sum = 0.0; + } } diff --git a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java index d76f83d3b..c985eac95 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java @@ -39,16 +39,16 @@ void testTransfer() throws MalformedURLException { parentMD.addValue("cookie.source", "example.com"); parentMD.addValue("fetchInterval", "200"); Metadata outlinkMD = - mdt.getMetaForOutlink( - "http://www.example.com/outlink.html", "http://www.example.com", parentMD); + mdt.getMetaForOutlink( + "http://www.example.com/outlink.html", "http://www.example.com", parentMD); // test the value of track seed, depth and fetch fields Assertions.assertEquals("1", outlinkMD.getFirstValue(MetadataTransfer.depthKeyName)); Set expectedFields = - Set.of( - MetadataTransfer.urlPathKeyName, - MetadataTransfer.depthKeyName, - "cookie.id", - "cookie.source"); + Set.of( + MetadataTransfer.urlPathKeyName, + MetadataTransfer.depthKeyName, + "cookie.id", + "cookie.source"); Assertions.assertEquals(expectedFields, outlinkMD.keySet()); String[] urlpath = outlinkMD.getValues(MetadataTransfer.urlPathKeyName); Assertions.assertEquals(1, urlpath.length); @@ -67,8 +67,8 @@ void testCustomTransferClass() throws MalformedURLException { Assertions.assertTrue(hasThrownException); conf = new HashMap<>(); conf.put( - MetadataTransfer.metadataTransferClassParamName, - MyCustomTransferClass.class.getName()); + MetadataTransfer.metadataTransferClassParamName, + MyCustomTransferClass.class.getName()); hasThrownException = false; try { MetadataTransfer.getInstance(conf); @@ -112,6 +112,7 @@ void testFilterWithAsterisk() { filteredMetadata = mdt.filter(metadata); Assertions.assertEquals(6, filteredMetadata.size()); } -} -class MyCustomTransferClass extends MetadataTransfer {} + static class MyCustomTransferClass extends MetadataTransfer { + } +} From c75b519425d4f4281986dad7e212f960107dd3a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Szigecs=C3=A1n?= Date: Sun, 17 May 2026 18:41:25 +0200 Subject: [PATCH 8/8] Fix code format --- .../filtering/regex/FastURLFilter.java | 22 +++++++++---------- .../parse/filter/CollectionTagger.java | 7 +++--- .../stormcrawler/spout/MemorySpout.java | 14 +++++------- .../stormcrawler/util/PerSecondReducer.java | 6 ++--- .../util/MetadataTransferTest.java | 21 +++++++++--------- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 038fe5e81..aff263f4a 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -90,7 +90,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode // config via json failed - trying from global config if (this.resourceFile == null) { this.resourceFile = - ConfUtils.getString(stormConf, "fast.urlfilter.file", "fast.urlfilter.json"); + ConfUtils.getString(stormConf, "fast.urlfilter.file", "fast.urlfilter.json"); } try { @@ -108,7 +108,7 @@ public String getResourceFile() { @Override public void loadJSONResources(InputStream inputStream) - throws JsonParseException, JsonMappingException, IOException { + throws JsonParseException, JsonMappingException, IOException { JsonNode rootNode = objectMapper.readTree(inputStream); @@ -171,9 +171,9 @@ public void loadJSONResources(InputStream inputStream) @Override public @Nullable String filter( - @Nullable URL sourceUrl, - @Nullable Metadata sourceMetadata, - @NotNull String urlToFilter) { + @Nullable URL sourceUrl, + @Nullable Metadata sourceMetadata, + @NotNull String urlToFilter) { try { if (rules.filter(urlToFilter, sourceMetadata)) { return null; @@ -205,8 +205,8 @@ public void addScope(Scope s, Scope.Type t, String value) { /** * Try the rules from the hostname, domain name, metadata and global scopes in this order. - * Returns true if the URL should be removed, false otherwise. The value returns the value of - * the first matching rule, be it positive or negative. + * Returns true if the URL should be removed, false otherwise. The value returns the value + * of the first matching rule, be it positive or negative. * * @throws MalformedURLException */ @@ -238,10 +238,10 @@ public boolean filter(String url, Metadata metadata) throws MalformedURLExceptio for (String v : vals) { if (v.equalsIgnoreCase(scope.getValue())) { FastURLFilter.LOG.debug( - "Filtering {} matching metadata {}:{}", - url, - scope.getKey(), - scope.getValue()); + "Filtering {} matching metadata {}:{}", + url, + scope.getKey(), + scope.getValue()); if (checkScope(scope, u)) { return true; } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java index 64831db8c..9181ff853 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java @@ -70,8 +70,7 @@ public class CollectionTagger extends ParseFilter implements JSONResource { private static final Logger LOG = LoggerFactory.getLogger(CollectionTagger.class); private static final ObjectMapper objectMapper = new ObjectMapper(); - private static final TypeReference reference = new TypeReference<>() { - }; + private static final TypeReference reference = new TypeReference<>() {}; private Collections collections = new Collections(); @@ -97,7 +96,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode } if (this.resourceFile == null) { this.resourceFile = - ConfUtils.getString(stormConf, "collections.file", "collections.json"); + ConfUtils.getString(stormConf, "collections.file", "collections.json"); } try { @@ -115,7 +114,7 @@ public String getResourceFile() { @Override public void loadJSONResources(InputStream inputStream) - throws JsonParseException, JsonMappingException, IOException { + throws JsonParseException, JsonMappingException, IOException { collections = (Collections) objectMapper.readValue(inputStream, reference); } diff --git a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java index 9848a6c74..6c9182c50 100644 --- a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java +++ b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java @@ -66,16 +66,14 @@ public MemorySpout(String... urls) { * statusupdaterbolt. * * @param withDiscoveredStatus whether the tuples generated should contain a Status field with - * DISCOVERED as value and be emitted on the status stream + * DISCOVERED as value and be emitted on the status stream */ public MemorySpout(boolean withDiscoveredStatus, String... urls) { this.withDiscoveredStatus = withDiscoveredStatus; startingUrls = urls; } - /** - * Add a new URL with the given metadata and nextFetch-date. - */ + /** Add a new URL with the given metadata and nextFetch-date. */ public static void add(String url, Metadata md, Date nextFetch) { LOG.debug("Adding {} with md {} and nextFetch {}", url, md, nextFetch); ScheduledURL tuple = new ScheduledURL(url, md, nextFetch); @@ -86,7 +84,7 @@ public static void add(String url, Metadata md, Date nextFetch) { @Override public void open( - Map conf, TopologyContext context, SpoutOutputCollector collector) { + Map conf, TopologyContext context, SpoutOutputCollector collector) { this.collector = collector; // check that there is only one instance of it @@ -99,7 +97,7 @@ public void open( for (String u : startingUrls) { LOG.debug("About to deserialize {} ", u); List tuple = - scheme.deserialize(ByteBuffer.wrap(u.getBytes(StandardCharsets.UTF_8))); + scheme.deserialize(ByteBuffer.wrap(u.getBytes(StandardCharsets.UTF_8))); add((String) tuple.get(0), (Metadata) tuple.get(1), now); } CrawlerMetrics.registerGauge(context, conf, "queue_size", queue::size, 10); @@ -179,9 +177,7 @@ public String toString() { return url + "\t" + nextFetchDate; } - /** - * Sort by next fetch date then URl. * - */ + /** Sort by next fetch date then URl. * */ @Override public int compareTo(ScheduledURL o) { // compare the URL diff --git a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java index b698e7ed5..cbb8a140e 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java +++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java @@ -37,9 +37,9 @@ public TimeReducerState reduce(TimeReducerState accumulator, Object input) { accumulator.sum += ((Integer) input).doubleValue(); } else { throw new RuntimeException( - "MeanReducer::reduce called with unsupported input type `" - + input.getClass() - + "`. Supported types are Double, Long, Integer."); + "MeanReducer::reduce called with unsupported input type `" + + input.getClass() + + "`. Supported types are Double, Long, Integer."); } return accumulator; } diff --git a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java index c985eac95..b2b297ad1 100644 --- a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java +++ b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java @@ -39,16 +39,16 @@ void testTransfer() throws MalformedURLException { parentMD.addValue("cookie.source", "example.com"); parentMD.addValue("fetchInterval", "200"); Metadata outlinkMD = - mdt.getMetaForOutlink( - "http://www.example.com/outlink.html", "http://www.example.com", parentMD); + mdt.getMetaForOutlink( + "http://www.example.com/outlink.html", "http://www.example.com", parentMD); // test the value of track seed, depth and fetch fields Assertions.assertEquals("1", outlinkMD.getFirstValue(MetadataTransfer.depthKeyName)); Set expectedFields = - Set.of( - MetadataTransfer.urlPathKeyName, - MetadataTransfer.depthKeyName, - "cookie.id", - "cookie.source"); + Set.of( + MetadataTransfer.urlPathKeyName, + MetadataTransfer.depthKeyName, + "cookie.id", + "cookie.source"); Assertions.assertEquals(expectedFields, outlinkMD.keySet()); String[] urlpath = outlinkMD.getValues(MetadataTransfer.urlPathKeyName); Assertions.assertEquals(1, urlpath.length); @@ -67,8 +67,8 @@ void testCustomTransferClass() throws MalformedURLException { Assertions.assertTrue(hasThrownException); conf = new HashMap<>(); conf.put( - MetadataTransfer.metadataTransferClassParamName, - MyCustomTransferClass.class.getName()); + MetadataTransfer.metadataTransferClassParamName, + MyCustomTransferClass.class.getName()); hasThrownException = false; try { MetadataTransfer.getInstance(conf); @@ -113,6 +113,5 @@ void testFilterWithAsterisk() { Assertions.assertEquals(6, filteredMetadata.size()); } - static class MyCustomTransferClass extends MetadataTransfer { - } + static class MyCustomTransferClass extends MetadataTransfer {} }