diff --git a/checkstyle.xml b/checkstyle.xml
index a10df412a..76b58c95f 100644
--- a/checkstyle.xml
+++ b/checkstyle.xml
@@ -95,9 +95,7 @@ under the License.
-
-
-
+
@@ -254,7 +252,6 @@ under the License.
-
-
-
@@ -387,7 +382,6 @@ under the License.
LITERAL_SWITCH, LITERAL_SYNCHRONIZED, LITERAL_WHILE, METHOD_CALL,
METHOD_DEF, QUESTION, RESOURCE_SPECIFICATION, SUPER_CTOR_CALL, LAMBDA,
RECORD_DEF, RECORD_PATTERN_DEF"/>
-
@@ -410,9 +404,7 @@ under the License.
-
-
-
+
@@ -470,7 +462,6 @@ under the License.
value="^(?![a-z]$)(?![a-z][A-Z])[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)*(?:_[0-9]+)*$"/>
-
diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 26c9afedd..aff263f4a 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -183,184 +183,184 @@ public void loadJSONResources(InputStream inputStream)
}
return urlToFilter;
}
-}
-
-class Rules {
-
- private Scope globalRules;
- private Map domainRules = new HashMap<>();
- private Map hostNameRules = new HashMap<>();
- private List metadataRules = new ArrayList<>();
-
- public void addScope(Scope s, Scope.Type t, String value) {
- if (t.equals(Scope.Type.GLOBAL)) {
- globalRules = s;
- } else if (t.equals(Scope.Type.DOMAIN)) {
- domainRules.put(value, s);
- } else if (t.equals(Scope.Type.HOSTNAME)) {
- hostNameRules.put(value, s);
- } else if (t.equals(Scope.Type.METADATA)) {
- metadataRules.add(new MDScope(value, s.getRules()));
- }
- }
- /**
- * Try the rules from the hostname, domain name, metadata and global scopes in this order.
- * Returns true if the URL should be removed, false otherwise. The value returns the value of
- * the first matching rule, be it positive or negative.
- *
- * @throws MalformedURLException
- */
- public boolean filter(String url, Metadata metadata) throws MalformedURLException {
- URL u = URLUtil.toURL(url);
-
- // first try the full hostname
- String hostname = u.getHost();
- if (checkScope(hostNameRules.get(hostname), u)) {
- return true;
+ static class Rules {
+
+ private Scope globalRules;
+ private Map domainRules = new HashMap<>();
+ private Map hostNameRules = new HashMap<>();
+ private List metadataRules = new ArrayList<>();
+
+ public void addScope(Scope s, Scope.Type t, String value) {
+ if (t.equals(Scope.Type.GLOBAL)) {
+ globalRules = s;
+ } else if (t.equals(Scope.Type.DOMAIN)) {
+ domainRules.put(value, s);
+ } else if (t.equals(Scope.Type.HOSTNAME)) {
+ hostNameRules.put(value, s);
+ } else if (t.equals(Scope.Type.METADATA)) {
+ metadataRules.add(new MDScope(value, s.getRules()));
+ }
}
- // then on the various components of the domain
- final String[] domainParts = hostname.split("\\.");
- String domain = null;
- for (int i = domainParts.length - 1; i >= 0; i--) {
- domain = domainParts[i] + (domain == null ? "" : "." + domain);
- if (checkScope(domainRules.get(domain), u)) {
+ /**
+ * Try the rules from the hostname, domain name, metadata and global scopes in this order.
+ * Returns true if the URL should be removed, false otherwise. The result is that of the
+ * first matching rule, be it positive or negative.
+ *
+ * @throws MalformedURLException
+ */
+ public boolean filter(String url, Metadata metadata) throws MalformedURLException {
+ URL u = URLUtil.toURL(url);
+
+ // first try the full hostname
+ String hostname = u.getHost();
+ if (checkScope(hostNameRules.get(hostname), u)) {
return true;
}
- }
- // check on parent's URL metadata
- for (MDScope scope : metadataRules) {
- final String[] vals = metadata.getValues(scope.getKey());
- if (vals == null) {
- continue;
+ // then on the various components of the domain
+ final String[] domainParts = hostname.split("\\.");
+ String domain = null;
+ for (int i = domainParts.length - 1; i >= 0; i--) {
+ domain = domainParts[i] + (domain == null ? "" : "." + domain);
+ if (checkScope(domainRules.get(domain), u)) {
+ return true;
+ }
}
- for (String v : vals) {
- if (v.equalsIgnoreCase(scope.getValue())) {
- FastURLFilter.LOG.debug(
- "Filtering {} matching metadata {}:{}",
- url,
- scope.getKey(),
- scope.getValue());
- if (checkScope(scope, u)) {
- return true;
+
+ // check on parent's URL metadata
+ for (MDScope scope : metadataRules) {
+ final String[] vals = metadata.getValues(scope.getKey());
+ if (vals == null) {
+ continue;
+ }
+ for (String v : vals) {
+ if (v.equalsIgnoreCase(scope.getValue())) {
+ FastURLFilter.LOG.debug(
+ "Filtering {} matching metadata {}:{}",
+ url,
+ scope.getKey(),
+ scope.getValue());
+ if (checkScope(scope, u)) {
+ return true;
+ }
}
}
}
- }
- if (checkScope(globalRules, u)) {
- return true;
- }
-
- return false;
- }
+ if (checkScope(globalRules, u)) {
+ return true;
+ }
- private boolean checkScope(Scope s, URL u) {
- if (s == null) {
return false;
}
- for (Rule r : s.getRules()) {
- String haystack = u.getPath();
- // whether to include the query as well?
- if (r.getType().toString().endsWith("QUERY")) {
- if (u.getQuery() != null) {
- haystack += "?" + u.getQuery();
- }
+
+ private boolean checkScope(Scope s, URL u) {
+ if (s == null) {
+ return false;
}
- if (r.getPattern().matcher(haystack).find()) {
- // matches! returns true for DENY, false for ALLOW
- return r.getType().toString().startsWith("DENY");
+ for (Rule r : s.getRules()) {
+ String haystack = u.getPath();
+ // whether to include the query as well?
+ if (r.getType().toString().endsWith("QUERY")) {
+ if (u.getQuery() != null) {
+ haystack += "?" + u.getQuery();
+ }
+ }
+ if (r.getPattern().matcher(haystack).find()) {
+ // matches! returns true for DENY, false for ALLOW
+ return r.getType().toString().startsWith("DENY");
+ }
}
+ return false;
}
- return false;
}
-}
-class Scope {
+ static class Scope {
- public enum Type {
- DOMAIN,
- GLOBAL,
- HOSTNAME,
- METADATA
- }
+ public enum Type {
+ DOMAIN,
+ GLOBAL,
+ HOSTNAME,
+ METADATA
+ }
- protected Rule[] rules;
+ protected Rule[] rules;
- public void setRules(List rlist) {
- this.rules = rlist.toArray(new Rule[0]);
- }
+ public void setRules(List rlist) {
+ this.rules = rlist.toArray(new Rule[0]);
+ }
- public Rule[] getRules() {
- return rules;
+ public Rule[] getRules() {
+ return rules;
+ }
}
-}
-class MDScope extends Scope {
+ static class MDScope extends Scope {
- private String key;
- private String value;
+ private String key;
+ private String value;
- MDScope(String constraint, Rule[] rules) {
- this.rules = rules;
- int eq = constraint.indexOf("=");
- if (eq != -1) {
- key = constraint.substring(0, eq);
- value = constraint.substring(eq + 1);
- } else {
- key = constraint;
+ MDScope(String constraint, Rule[] rules) {
+ this.rules = rules;
+ int eq = constraint.indexOf("=");
+ if (eq != -1) {
+ key = constraint.substring(0, eq);
+ value = constraint.substring(eq + 1);
+ } else {
+ key = constraint;
+ }
}
- }
- public String getKey() {
- return key;
- }
+ public String getKey() {
+ return key;
+ }
- public String getValue() {
- return value;
+ public String getValue() {
+ return value;
+ }
}
-}
-class Rule {
+ static class Rule {
- public enum Type {
- DENYPATH,
- DENYPATHQUERY,
- ALLOWPATH,
- ALLOWPATHQUERY
- }
+ public enum Type {
+ DENYPATH,
+ DENYPATHQUERY,
+ ALLOWPATH,
+ ALLOWPATHQUERY
+ }
- private Type type;
- private Pattern pattern;
-
- public Rule(String line) {
- int offset = 0;
- String lcline = line.toLowerCase(Locale.ROOT);
- // separate the type from the pattern
- for (Type t : Type.values()) {
- String start = t.toString().toLowerCase(Locale.ROOT) + " ";
- if (lcline.startsWith(start)) {
- type = t;
- offset = start.length();
- break;
+ private Type type;
+ private Pattern pattern;
+
+ public Rule(String line) {
+ int offset = 0;
+ String lcline = line.toLowerCase(Locale.ROOT);
+ // separate the type from the pattern
+ for (Type t : Type.values()) {
+ String start = t.toString().toLowerCase(Locale.ROOT) + " ";
+ if (lcline.startsWith(start)) {
+ type = t;
+ offset = start.length();
+ break;
+ }
+ }
+ // no match?
+ if (type == null) {
+ return;
}
- }
- // no match?
- if (type == null) {
- return;
- }
- String patternString = line.substring(offset).trim();
- pattern = Pattern.compile(patternString);
- }
+ String patternString = line.substring(offset).trim();
+ pattern = Pattern.compile(patternString);
+ }
- public Type getType() {
- return type;
- }
+ public Type getType() {
+ return type;
+ }
- public Pattern getPattern() {
- return pattern;
+ public Pattern getPattern() {
+ return pattern;
+ }
}
}
diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java
index ba5c5cd05..9181ff853 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/CollectionTagger.java
@@ -125,78 +125,78 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult
parse.get(url).getMetadata().setValues(key, tags);
}
}
-}
-class Collections {
+ static class Collections {
- private Set collections;
+ private Set collections;
- public void setCollections(Set collections) {
- this.collections = collections;
- }
+ public void setCollections(Set collections) {
+ this.collections = collections;
+ }
- public String[] tag(String url) {
- Set tags = new HashSet<>();
- for (Collection collection : collections) {
- if (collection.matches(url)) {
- tags.add(collection.getName());
+ public String[] tag(String url) {
+ Set tags = new HashSet<>();
+ for (Collection collection : collections) {
+ if (collection.matches(url)) {
+ tags.add(collection.getName());
+ }
}
+ return tags.toArray(new String[0]);
}
- return tags.toArray(new String[0]);
}
-}
-
-class Collection {
-
- private String name;
- private Set includePatterns;
- private Set excludePatterns;
- public String getName() {
- return name;
- }
+ static class Collection {
- public void setName(String name) {
- this.name = name;
- }
+ private String name;
+ private Set includePatterns;
+ private Set excludePatterns;
- /**
- * @return true if the URL matches a pattern for this collection and no exclusion patterns
- */
- public boolean matches(String url) {
- boolean matches = false;
- for (Pattern includeP : includePatterns) {
- Matcher m = includeP.matcher(url);
- if (m.matches()) {
- matches = true;
- break;
- }
- }
- // no match
- if (!matches) {
- return false;
+ public String getName() {
+ return name;
}
- if (excludePatterns == null) {
- return true;
+ public void setName(String name) {
+ this.name = name;
}
- // check for antipatterns
- for (Pattern excludeP : excludePatterns) {
- Matcher m = excludeP.matcher(url);
- if (m.matches()) {
+ /**
+ * @return true if the URL matches a pattern for this collection and no exclusion patterns
+ */
+ public boolean matches(String url) {
+ boolean matches = false;
+ for (Pattern includeP : includePatterns) {
+ Matcher m = includeP.matcher(url);
+ if (m.matches()) {
+ matches = true;
+ break;
+ }
+ }
+ // no match
+ if (!matches) {
return false;
}
- }
- return true;
- }
+ if (excludePatterns == null) {
+ return true;
+ }
- public void setIncludePatterns(Set includePatterns) {
- this.includePatterns = includePatterns;
- }
+ // check for antipatterns
+ for (Pattern excludeP : excludePatterns) {
+ Matcher m = excludeP.matcher(url);
+ if (m.matches()) {
+ return false;
+ }
+ }
- public void setExcludePatterns(Set excludePatterns) {
- this.excludePatterns = excludePatterns;
+ return true;
+ }
+
+ public void setIncludePatterns(Set includePatterns) {
+ this.includePatterns = includePatterns;
+ }
+
+ public void setExcludePatterns(Set excludePatterns) {
+ this.excludePatterns = excludePatterns;
+ }
}
}
diff --git a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java
index 221dae683..6c9182c50 100644
--- a/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java
+++ b/core/src/main/java/org/apache/stormcrawler/spout/MemorySpout.java
@@ -160,39 +160,39 @@ public void deactivate() {
super.deactivate();
active = false;
}
-}
-
-class ScheduledURL implements Comparable {
- Date nextFetchDate;
- String url;
- Metadata metadata;
-
- ScheduledURL(String url, Metadata m, Date nextFetchDate) {
- this.nextFetchDate = nextFetchDate;
- this.url = url;
- this.metadata = m;
- }
- @Override
- public String toString() {
- return url + "\t" + nextFetchDate;
- }
+ static class ScheduledURL implements Comparable {
+ Date nextFetchDate;
+ String url;
+ Metadata metadata;
- /** Sort by next fetch date then URl. * */
- @Override
- public int compareTo(ScheduledURL o) {
- // compare the URL
- int compString = url.compareTo(o.url);
- if (compString == 0) {
- return 0;
+ ScheduledURL(String url, Metadata m, Date nextFetchDate) {
+ this.nextFetchDate = nextFetchDate;
+ this.url = url;
+ this.metadata = m;
}
- // compare the date
- int comp = nextFetchDate.compareTo(o.nextFetchDate);
- if (comp != 0) {
- return comp;
+ @Override
+ public String toString() {
+ return url + "\t" + nextFetchDate;
}
- return compString;
+ /** Same URL compares equal; otherwise sort by next fetch date, then URL. */
+ @Override
+ public int compareTo(ScheduledURL o) {
+ // compare the URL
+ int compString = url.compareTo(o.url);
+ if (compString == 0) {
+ return 0;
+ }
+
+ // compare the date
+ int comp = nextFetchDate.compareTo(o.nextFetchDate);
+ if (comp != 0) {
+ return comp;
+ }
+
+ return compString;
+ }
}
}
diff --git a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java
index 3a6fd83e2..38592bfee 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/MetadataTransfer.java
@@ -123,7 +123,7 @@ protected void configure(Map conf) {
* the URL path.
*/
public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata parentMetadata) {
- Metadata md = _filter(parentMetadata, mdToTransfer);
+ Metadata md = filter(parentMetadata, mdToTransfer);
// keep the path?
if (trackPath) {
@@ -150,11 +150,11 @@ public Metadata getMetaForOutlink(String targetUrl, String sourceUrl, Metadata p
* not necessarily transferred to the outlinks.
*/
public Metadata filter(Metadata metadata) {
- Metadata filteredMetadata = _filter(metadata, mdToTransfer);
+ Metadata filteredMetadata = filter(metadata, mdToTransfer);
// add the features that are only persisted but
// not transferred like __redirTo_
- filteredMetadata.putAll(_filter(metadata, mdToPersistOnly));
+ filteredMetadata.putAll(filter(metadata, mdToPersistOnly));
return filteredMetadata;
}
@@ -163,7 +163,7 @@ public Metadata filter(Metadata metadata) {
* Filter the metadata based on a set of keys. If a key ends with a * then all the keys starting
* with the prefix will be added.
*/
- private Metadata _filter(Metadata metadata, Set filter) {
+ private Metadata filter(Metadata metadata, Set filter) {
Metadata filteredMetadata = new Metadata();
for (String key : filter) {
diff --git a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
index 8c886152a..cbb8a140e 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
@@ -20,7 +20,7 @@
import org.apache.storm.metric.api.IReducer;
/** Used to return an average value per second. */
-public class PerSecondReducer implements IReducer {
+public class PerSecondReducer implements IReducer {
@Override
public TimeReducerState init() {
@@ -54,9 +54,9 @@ public Object extractResult(TimeReducerState accumulator) {
double permsec = accumulator.sum / msec;
return permsec * 1000d;
}
-}
-class TimeReducerState {
- public long started = System.currentTimeMillis();
- public double sum = 0.0;
+ static class TimeReducerState {
+ public long started = System.currentTimeMillis();
+ public double sum = 0.0;
+ }
}
diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
index cca978a59..2942c6a56 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
@@ -115,8 +115,8 @@ void setupParserBolt() {
setupParserBolt(bolt);
}
+ /** Checks that content in script is not included in the text representation. */
@Test
- /** Checks that content in script is not included in the text representation */
void testNoScriptInText() throws IOException {
bolt.prepare(
new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
@@ -129,8 +129,8 @@ void testNoScriptInText() throws IOException {
"Text should not contain the content of script tags");
}
+ /** Checks that individual links marked as rel="nofollow" are not followed. */
@Test
- /** Checks that individual links marked as rel="nofollow" are not followed */
void testNoFollowOutlinks() throws IOException {
bolt.prepare(
new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(output));
diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java
index 6e9268318..ee831691d 100644
--- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java
+++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsParseFilter.java
@@ -39,7 +39,7 @@ public class SubDocumentsParseFilter extends ParseFilter {
LoggerFactory.getLogger(SubDocumentsParseFilter.class);
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
+ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {
InputStream stream = new ByteArrayInputStream(content);
@@ -69,7 +69,7 @@ public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult
}
}
} catch (Exception e) {
- LOG.error("Error processing sitemap from {}: {}", URL, e);
+ LOG.error("Error processing sitemap from {}: {}", url, e);
}
}
diff --git a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java
index 29351fd7e..639ae2877 100644
--- a/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/util/InitialisationUtilTest.java
@@ -32,7 +32,7 @@
class InitialisationUtilTest {
@Test
- void can_initialize_a_simple_class() {
+ void can_initialize_simple_class() {
final SimpleOpenClass simpleOpenClass =
InitialisationUtil.initializeFromQualifiedName(
SimpleOpenClass.class.getName(), SimpleOpenClass.class);
@@ -150,7 +150,7 @@ void fails_if_qualified_class_name_is_blank() {
}
@Test
- void fails_if_class_to_initialize_not_extending_classes_to_test_1() {
+ void fails_if_class_to_initialize_not_extending_classes_to_test() {
Assertions.assertThrows(
RuntimeException.class,
() ->
@@ -161,7 +161,7 @@ void fails_if_class_to_initialize_not_extending_classes_to_test_1() {
}
@Test
- void fails_if_class_to_initialize_not_extending_classes_to_test_2() {
+ void fails_if_class_to_initialize_not_extending_classes_to_test_other() {
Assertions.assertThrows(
RuntimeException.class,
() ->
diff --git a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java
index d28a6d941..b2b297ad1 100644
--- a/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/util/MetadataTransferTest.java
@@ -68,7 +68,7 @@ void testCustomTransferClass() throws MalformedURLException {
conf = new HashMap<>();
conf.put(
MetadataTransfer.metadataTransferClassParamName,
- myCustomTransferClass.class.getName());
+ MyCustomTransferClass.class.getName());
hasThrownException = false;
try {
MetadataTransfer.getInstance(conf);
@@ -112,6 +112,6 @@ void testFilterWithAsterisk() {
filteredMetadata = mdt.filter(metadata);
Assertions.assertEquals(6, filteredMetadata.size());
}
-}
-class myCustomTransferClass extends MetadataTransfer {}
+ static class MyCustomTransferClass extends MetadataTransfer {}
+}
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
index a5946cea3..6f6d7be66 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
@@ -66,8 +66,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode
}
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
- refresher.getDelegate().filter(URL, content, doc, parse);
+ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {
+ refresher.getDelegate().filter(url, content, doc, parse);
}
@Override
diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
index 920ad5dea..67bcca62b 100644
--- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
+++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
@@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode
}
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {}
+ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {}
@Override
public String getResourceFile() {
diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
index 13fa40fba..d8995ff87 100644
--- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
+++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
@@ -69,8 +69,8 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode
}
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
- refresher.getDelegate().filter(URL, content, doc, parse);
+ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {
+ refresher.getDelegate().filter(url, content, doc, parse);
}
@Override
diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
index 920ad5dea..67bcca62b 100644
--- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
+++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
@@ -84,7 +84,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode
}
@Override
- public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {}
+ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) {}
@Override
public String getResourceFile() {
diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java
index 301b106dd..420b9a0bc 100644
--- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java
+++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java
@@ -229,7 +229,7 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
// NOTE: The handler will only be called for the first url if the
// response is a redirect.
page.route(
- _url -> true,
+ lambdaUrl -> true,
route -> {
// abort if we know the main page is a redirection
if (status.get() != -1) {
diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
index fc4e5e33a..425a0a0cc 100644
--- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
+++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
@@ -45,12 +45,12 @@ void setupParserBolt() {
setupParserBolt(bolt);
}
- @Test
/**
- * Checks that recursive docs are handled correctly
+ * Checks that recursive docs are handled correctly.
*
- * @see https://issues.apache.org/jira/browse/TIKA-2096
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-2096">TIKA-2096</a>
*/
+ @Test
void testRecursiveDoc() throws IOException {
Map conf = new HashMap<>();
conf.put("parser.extract.embedded", true);
@@ -71,12 +71,12 @@ void testRecursiveDoc() throws IOException {
.contains("Life, Liberty and the pursuit of Happiness"));
}
- @Test
/**
- * Checks that the mimetype whitelists are handled correctly
+ * Checks that the mimetype whitelists are handled correctly.
*
- * @see https://github.com/apache/stormcrawler/issues/712
+ * @see <a href="https://github.com/apache/stormcrawler/issues/712">#712</a>
*/
+ @Test
void testMimeTypeWhileList() throws IOException {
Map conf = new HashMap<>();
conf.put("parser.mimetype.whitelist", "application/.+word.*");
diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java
index 8bb1937d3..5b77a7289 100644
--- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java
+++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCHdfsBolt.java
@@ -59,8 +59,8 @@ public WARCHdfsBolt() {
withFsUrl("file:///");
}
- public WARCHdfsBolt withHeader(Map header_fields) {
- this.header_fields = header_fields;
+ public WARCHdfsBolt withHeader(Map headerFields) {
+ this.header_fields = headerFields;
return this;
}
diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java
index 0dbac9bb9..662355122 100644
--- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java
+++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCHdfsBoltTest.java
@@ -190,34 +190,24 @@ private Tuple getPage(String httpVersionString) {
String txt = "abcdef";
byte[] content = txt.getBytes(StandardCharsets.UTF_8);
Metadata metadata = new Metadata();
- metadata.addValue( //
+ metadata.addValue(
protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY,
"GET / "
+ httpVersionString
- + //
- "\r\n"
- + //
- "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n"
- + //
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
- + //
- "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n"
- + //
- "Accept-Encoding: br,gzip\r\n"
- + //
- "Host: example.org\r\n"
+ + "\r\n"
+ + "User-Agent: myBot/1.0 (https://example.org/bot/; bot@example.org)\r\n"
+ + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
+ + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n"
+ + "Accept-Encoding: br,gzip\r\n"
+ + "Host: example.org\r\n"
+ "Connection: Keep-Alive\r\n\r\n");
- metadata.addValue( //
+ metadata.addValue(
protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY,
httpVersionString
- + //
- " 200 OK\r\n"
- + //
- "Content-Type: text/html\r\n"
- + //
- "Content-Encoding: gzip\r\n"
- + //
- "Content-Length: 26\r\n"
+ + " 200 OK\r\n"
+ + "Content-Type: text/html\r\n"
+ + "Content-Encoding: gzip\r\n"
+ + "Content-Length: 26\r\n"
+ "Connection: close\r\n\r\n");
metadata.addValue(
protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY,
diff --git a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java
index c02fc8e38..6c66204d7 100644
--- a/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java
+++ b/external/warc/src/test/java/org/apache/stormcrawler/warc/WARCRecordFormatTest.java
@@ -122,15 +122,12 @@ void testReplaceHeaders() {
byte[] content = txt.getBytes(StandardCharsets.UTF_8);
String sha1str = "sha1:D6FMCDZDYW23YELHXWUEXAZ6LQCXU56S";
Metadata metadata = new Metadata();
- metadata.addValue( //
- protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, //
+ metadata.addValue(
+ protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY,
"HTTP/1.1 200 OK\r\n"
- + //
- "Content-Type: text/html\r\n"
- + //
- "Content-Encoding: gzip\r\n"
- + //
- "Content-Length: 26\r\n"
+ + "Content-Type: text/html\r\n"
+ + "Content-Encoding: gzip\r\n"
+ + "Content-Length: 26\r\n"
+ "Connection: close");
metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123");
Tuple tuple = mock(Tuple.class);
@@ -165,15 +162,12 @@ void testReplaceHttpVersion() {
String txt = "abcdef";
byte[] content = txt.getBytes(StandardCharsets.UTF_8);
Metadata metadata = new Metadata();
- metadata.addValue( //
- protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY, //
+ metadata.addValue(
+ protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY,
"HTTP/2 200 OK\r\n"
- + //
- "Content-Type: text/html\r\n"
- + //
- "Content-Encoding: gzip\r\n"
- + //
- "Content-Length: 26\r\n"
+ + "Content-Type: text/html\r\n"
+ + "Content-Encoding: gzip\r\n"
+ + "Content-Length: 26\r\n"
+ "Connection: close");
metadata.addValue(
protocolMDprefix + ProtocolResponse.PROTOCOL_VERSIONS_KEY,
@@ -208,17 +202,13 @@ void testRequestHeader() {
String txt = "abcdef";
byte[] content = txt.getBytes(StandardCharsets.UTF_8);
Metadata metadata = new Metadata();
- metadata.addValue( //
- protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY, //
+ metadata.addValue(
+ protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY,
"GET / HTTP/2\r\n"
- + //
- "User-Agent: mybot\r\n"
- + //
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
- + //
- "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n"
- + //
- "Accept-Encoding: br,gzip\r\n"
+ + "User-Agent: mybot\r\n"
+ + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
+ + "Accept-Language: en-us,en-gb,en;q=0.7,*;q=0.3\r\n"
+ + "Accept-Encoding: br,gzip\r\n"
+ "Connection: Keep-Alive\r\n\r\n");
metadata.addValue(protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY, "123.123.123.123");
Tuple tuple = mock(Tuple.class);