diff --git a/bigtop-ci/build.sh b/bigtop-ci/build.sh
index dec4e6ccb0..77d9072a10 100755
--- a/bigtop-ci/build.sh
+++ b/bigtop-ci/build.sh
@@ -109,7 +109,12 @@ fi
# Start up build container
CONTAINER_ID=`docker run -d $DOCKER_RUN_OPTION $NEXUS $IMAGE_NAME /sbin/init`
-trap "docker rm -f $CONTAINER_ID" EXIT
+trap '[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID' EXIT
+
+if [ -z "$CONTAINER_ID" ]; then
+ echo "Failed to start Docker container (e.g. permission denied). Ensure the user is in the docker group."
+ exit 1
+fi
# Copy bigtop repo into container
docker cp $BIGTOP_HOME $CONTAINER_ID:/bigtop
@@ -124,7 +129,7 @@ RESULT=$?
mkdir -p output
docker cp $CONTAINER_ID:/bigtop/build .
docker cp $CONTAINER_ID:/bigtop/output .
-docker rm -f $CONTAINER_ID
+[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID
if [ $RESULT -ne 0 ]; then
exit 1
diff --git a/bigtop-deploy/puppet/manifests/cluster.pp b/bigtop-deploy/puppet/manifests/cluster.pp
index 177551ed6e..893c7b9552 100644
--- a/bigtop-deploy/puppet/manifests/cluster.pp
+++ b/bigtop-deploy/puppet/manifests/cluster.pp
@@ -65,6 +65,9 @@
solr => {
worker => ["solr-server"],
},
+ nutch => {
+ client => ["nutch-client"],
+ },
spark => {
worker => ["spark-on-yarn"],
client => ["spark-client"],
@@ -171,6 +174,7 @@
"hadoop_zookeeper",
"hcatalog",
"livy",
+ "nutch",
"solr",
"spark",
"tez",
diff --git a/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties
new file mode 100644
index 0000000000..ba87f915dc
--- /dev/null
+++ b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=INFO,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers)
+yarn.app.container.log.dir=/tmp
+
+# Define the root logger to the system property "hadoop.root.logger".
+log4j.rootLogger=${hadoop.root.logger}, EventCounter
+
+# Logging Threshold
+log4j.threshold=ALL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+#
+# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master).
+# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers.
+# Must be defined so rootLogger can use CLA when this file is used for container log4j.
+#
+log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender
+log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir}
+log4j.appender.CLA.containerLogFile=syslog
+log4j.appender.CLA.totalLogFileSize=10485760
+log4j.appender.CLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+hadoop.tasklog.iscleanup=false
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
+
+#=======
+# security audit logging
+
+security.audit.logger=INFO,console
+log4j.category.SecurityLogger=${security.audit.logger}
+log4j.additivity.SecurityLogger=false
+log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log
+log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd
+
+# hdfs audit logging
+
+hdfs.audit.logger=INFO,console
+log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
+log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
+log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log
+log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd
+
+
+# mapred audit logging
+
+mapred.audit.logger=INFO,console
+log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
+log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
+log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log
+log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd
+
+
+# Mapred job summary
+
+mapred.jobsummary.logger=INFO,console
+log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger}
+log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false
+log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log
+log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
+log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.JSA.DatePattern=.yyyy-MM-dd
diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp
index 164235dcec..bb704702f0 100644
--- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp
+++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp
@@ -128,6 +128,12 @@
require => [Package["hadoop"]],
}
+ # Deploy log4j.properties for YARN containers (no CLA appender; container-safe).
+ file { "/etc/hadoop/conf/log4j.properties":
+ content => template('hadoop/log4j.properties.erb'),
+ require => [Package["hadoop"]],
+ }
+
package { "hadoop":
ensure => latest,
require => Package["jdk"],
@@ -164,6 +170,8 @@
$hadoop_security_authentication = $hadoop::hadoop_security_authentication,
$kerberos_realm = $hadoop::kerberos_realm,
$yarn_nodemanager_vmem_check_enabled = undef,
+ # Ensure MR Application Master has log4j config (avoids exit code 1 in YARN containers)
+ $yarn_app_mapreduce_am_command_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
) inherits hadoop {
include hadoop::common
@@ -429,8 +437,8 @@
$mapreduce_job_reduce_slowstart_completedmaps = undef,
$mapreduce_map_memory_mb = undef,
$mapreduce_reduce_memory_mb = undef,
- $mapreduce_map_java_opts = "-Xmx1024m",
- $mapreduce_reduce_java_opts = "-Xmx1024m",
+ $mapreduce_map_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
+ $mapreduce_reduce_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
$hadoop_security_authentication = $hadoop::hadoop_security_authentication,
$kerberos_realm = $hadoop::kerberos_realm,
) inherits hadoop {
diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb
new file mode 100644
index 0000000000..4a14263e09
--- /dev/null
+++ b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Define some default values that can be overridden by system properties
+hadoop.root.logger=INFO,console
+hadoop.log.dir=.
+hadoop.log.file=hadoop.log
+
+# Root logger: only console (EventCounter/CLA not on YARN container classpath; causes AM ClassNotFoundException)
+log4j.rootLogger=${hadoop.root.logger}
+
+# Logging Threshold
+log4j.threshold=ALL
+
+#
+# Daily Rolling File Appender
+#
+
+log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
+log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
+
+# 30-day backup
+#log4j.appender.DRFA.MaxBackupIndex=30
+log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
+
+# Pattern format: Date LogLevel LoggerName LogMessage
+log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+# Debugging Pattern format
+#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+
+# CLA (ContainerLogAppender) omitted: class not on MR AM container classpath, causes ClassNotFoundException
+
+#
+# console
+# Add "console" to rootlogger above if you want to use this
+#
+
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
+
+#
+# TaskLog Appender
+#
+
+#Default values
+hadoop.tasklog.taskid=null
+hadoop.tasklog.noKeepSplits=4
+hadoop.tasklog.totalLogFileSize=100
+hadoop.tasklog.purgeLogSplits=true
+hadoop.tasklog.logsRetainHours=12
+hadoop.tasklog.iscleanup=false
+
+log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
+log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
+log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
+
+log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
+
+log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
+log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+
+#
+# Event Counter Appender
+# Sends counts of logging messages at different severity levels to Hadoop Metrics.
+#
+log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
+
+#=======
+# security audit logging
+
+security.audit.logger=INFO,console
+log4j.category.SecurityLogger=${security.audit.logger}
+log4j.additivity.SecurityLogger=false
+log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log
+log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd
+
+# hdfs audit logging
+
+hdfs.audit.logger=INFO,console
+log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
+log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
+log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log
+log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd
+
+
+# mapred audit logging
+
+mapred.audit.logger=INFO,console
+log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
+log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
+log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log
+log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
+log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd
+
+
+# Mapred job summary
+
+mapred.jobsummary.logger=INFO,console
+log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger}
+log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false
+log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender
+log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log
+log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
+log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
+log4j.appender.JSA.DatePattern=.yyyy-MM-dd
diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml
index d14eb3bb2c..d5de4af74a 100644
--- a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml
+++ b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml
@@ -211,6 +211,12 @@
$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*
+
+  <property>
+    <description>JVM options for the MapReduce Application Master (includes log4j config for container logs).</description>
+    <name>yarn.app.mapreduce.am.command-opts</name>
+    <value><%= @yarn_app_mapreduce_am_command_opts %></value>
+  </property>
<% if @yarn_scheduler_minimum_allocation_mb -%>
diff --git a/bigtop-deploy/puppet/modules/nutch/manifests/init.pp b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp
new file mode 100644
index 0000000000..abd747e259
--- /dev/null
+++ b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+class nutch {
+
+ class deploy ($roles) {
+ if ("nutch-client" in $roles) {
+ include nutch::client
+ }
+ }
+
+ class client {
+ package { "nutch":
+ ensure => latest,
+ }
+
+ file { "/etc/default/nutch":
+ content => template("nutch/nutch.default"),
+ require => Package["nutch"],
+ }
+ }
+}
diff --git a/bigtop-deploy/puppet/modules/nutch/templates/nutch.default b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default
new file mode 100644
index 0000000000..44e659aee3
--- /dev/null
+++ b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Nutch installation directory (runtime/deploy for Hadoop cluster)
+export NUTCH_HOME=/usr/lib/nutch
+
+# Nutch configuration (required for deploy bin scripts)
+export NUTCH_CONF_DIR=/etc/nutch/conf.dist
+
+# Hadoop configuration (required for cluster mode)
+export HADOOP_CONF_DIR=/etc/hadoop/conf
diff --git a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties
index f75a299775..23ed3bfc87 100644
--- a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties
+++ b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties
@@ -19,7 +19,8 @@ hadoop.log.dir=.
hadoop.log.file=hadoop.log
# Define the root logger to the system property "hadoop.root.logger".
-log4j.rootLogger=${hadoop.root.logger}, EventCounter
+# Container-safe: no CLA (ContainerLogAppender) or EventCounter in rootLogger.
+log4j.rootLogger=${hadoop.root.logger}
# Logging Threshold
log4j.threshhold=ALL
@@ -41,6 +42,8 @@ log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
+# CLA (ContainerLogAppender) omitted: not on MR AM container classpath; causes errors in YARN containers.
+
#
# console
# Add "console" to rootlogger above if you want to use this
diff --git a/bigtop-packages/src/common/hadoop/install_hadoop.sh b/bigtop-packages/src/common/hadoop/install_hadoop.sh
index 966890f007..4ec5ca1a6d 100755
--- a/bigtop-packages/src/common/hadoop/install_hadoop.sh
+++ b/bigtop-packages/src/common/hadoop/install_hadoop.sh
@@ -391,6 +391,12 @@ cp ${DISTRO_DIR}/conf.empty/mapred-site.xml $PREFIX/$ETC_HADOOP/conf.empty
sed -i -e '/^[^#]/s,^,#,' ${BUILD_DIR}/etc/hadoop/hadoop-env.sh
cp -r ${BUILD_DIR}/etc/hadoop/* $PREFIX/$ETC_HADOOP/conf.empty
rm -rf $PREFIX/$ETC_HADOOP/conf.empty/*.cmd
+# Overwrite with Bigtop container-safe log4j.properties (no CLA/EventCounter in rootLogger)
+# If not found (e.g. DEB build layout), Puppet deploys it at provision time.
+BIGTOP_LOG4J="$(dirname $0)/../conf.secure/log4j.properties"
+if [ -f "$BIGTOP_LOG4J" ]; then
+ cp "$BIGTOP_LOG4J" $PREFIX/$ETC_HADOOP/conf.empty/log4j.properties
+fi
# Install default wrapper
install -d -m 0755 $PREFIX/$ETC_DEFAULT
diff --git a/bigtop-packages/src/common/nutch/do-component-build b/bigtop-packages/src/common/nutch/do-component-build
new file mode 100644
index 0000000000..f5da0fbf5b
--- /dev/null
+++ b/bigtop-packages/src/common/nutch/do-component-build
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+if [ -f "$SCRIPT_DIR/bigtop.bom" ]; then
+ . "$SCRIPT_DIR/bigtop.bom"
+fi
+
+# FULL_VERSION is set by the RPM/DEB build
+FULL_VERSION=${FULL_VERSION:-$NUTCH_VERSION}
+# HADOOP_VERSION from bigtop.bom for aligning with the stack
+BUILD_OPTS="-Dhadoop.version=${HADOOP_VERSION:-3.3.0}"
+
+ant $BUILD_OPTS clean
+ant $BUILD_OPTS runtime
diff --git a/bigtop-packages/src/common/nutch/install_nutch.sh b/bigtop-packages/src/common/nutch/install_nutch.sh
new file mode 100644
index 0000000000..608991fa68
--- /dev/null
+++ b/bigtop-packages/src/common/nutch/install_nutch.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+usage() {
+ echo "
+usage: $0
+ Required:
+ --distro-dir=DIR path to distro-specific files (SOURCES)
+ --build-dir=DIR path to Nutch source tree (contains runtime/local and runtime/deploy)
+ --prefix=PREFIX path to install into (e.g. \$RPM_BUILD_ROOT)
+
+ Optional:
+ --bin-dir=DIR path for binaries [/usr/bin]
+ --lib-dir=DIR path for Nutch home [/usr/lib/nutch]
+ --etc-default=DIR path for defaults [/etc/default]
+ --conf-dir=DIR path for config [/etc/nutch]
+ "
+ exit 1
+}
+
+OPTS=$(getopt -n $0 -o '' \
+ -l 'prefix:' -l 'distro-dir:' -l 'build-dir:' \
+ -l 'bin-dir:' -l 'lib-dir:' -l 'etc-default:' -l 'conf-dir:' -- "$@")
+[ $? = 0 ] || usage
+eval set -- "$OPTS"
+
+PREFIX= BUILD_DIR= DISTRO_DIR= BIN_DIR= LIB_DIR= ETC_DEFAULT= CONF_DIR=
+while true; do
+ case "$1" in
+ --prefix) PREFIX=$2; shift 2 ;;
+ --distro-dir) DISTRO_DIR=$2; shift 2 ;;
+ --build-dir) BUILD_DIR=$2; shift 2 ;;
+ --bin-dir) BIN_DIR=$2; shift 2 ;;
+ --lib-dir) LIB_DIR=$2; shift 2 ;;
+ --etc-default) ETC_DEFAULT=$2; shift 2 ;;
+ --conf-dir) CONF_DIR=$2; shift 2 ;;
+ --) shift; break ;;
+ *) echo "Unknown option: $1"; usage ;;
+ esac
+done
+
+for var in PREFIX BUILD_DIR DISTRO_DIR; do
+ [ -n "$(eval "echo \$$var")" ] || { echo "Missing param: $var"; usage; }
+done
+
+BIN_DIR=${BIN_DIR:-$PREFIX/usr/bin}
+LIB_DIR=${LIB_DIR:-$PREFIX/usr/lib/nutch}
+ETC_DEFAULT=${ETC_DEFAULT:-$PREFIX/etc/default}
+CONF_DIR=${CONF_DIR:-$PREFIX/etc/nutch}
+
+RUNTIME_LOCAL="$BUILD_DIR/runtime/local"
+RUNTIME_DEPLOY="$BUILD_DIR/runtime/deploy"
+[ -d "$RUNTIME_LOCAL" ] || { echo "Build dir has no runtime/local: $RUNTIME_LOCAL"; exit 1; }
+[ -d "$RUNTIME_DEPLOY" ] || { echo "Build dir has no runtime/deploy: $RUNTIME_DEPLOY"; exit 1; }
+
+# Install runtime/deploy for cluster (uber jar + bin that uses hadoop jar)
+install -d -m 0755 "$(dirname "$LIB_DIR")"
+cp -a "$RUNTIME_DEPLOY" "$LIB_DIR"
+
+# Conf from runtime/local (deploy may not include full conf)
+install -d -m 0755 "$(dirname "$CONF_DIR")"
+install -d -m 0755 "$CONF_DIR/conf.dist"
+cp -a "$RUNTIME_LOCAL/conf/"* "$CONF_DIR/conf.dist/" 2>/dev/null || true
+
+install -d -m 0755 "$ETC_DEFAULT"
+mkdir -p "$ETC_DEFAULT"
+install -m 0644 "$DISTRO_DIR/nutch.default" "$ETC_DEFAULT/nutch"
+
+# Wrapper script for /usr/bin/nutch
+install -d -m 0755 "$BIN_DIR"
+cat > "$BIN_DIR/nutch" << 'WRAPPER'
+#!/bin/bash
+# Nutch launcher - sources /etc/default/nutch and runs nutch from NUTCH_HOME
+if [ -f /etc/default/nutch ]; then
+ . /etc/default/nutch
+fi
+NUTCH_HOME=${NUTCH_HOME:-/usr/lib/nutch}
+exec "$NUTCH_HOME/bin/nutch" "$@"
+WRAPPER
+chmod 755 "$BIN_DIR/nutch"
diff --git a/bigtop-packages/src/common/nutch/nutch.default b/bigtop-packages/src/common/nutch/nutch.default
new file mode 100644
index 0000000000..6dc31e74eb
--- /dev/null
+++ b/bigtop-packages/src/common/nutch/nutch.default
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Nutch installation directory (runtime/deploy for Hadoop cluster)
+export NUTCH_HOME=/usr/lib/nutch
+
+# Nutch configuration (required for deploy bin scripts)
+export NUTCH_CONF_DIR=${NUTCH_CONF_DIR:-/etc/nutch/conf.dist}
+
+# Hadoop configuration (required for cluster mode)
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
diff --git a/bigtop-packages/src/deb/nutch/changelog b/bigtop-packages/src/deb/nutch/changelog
new file mode 100644
index 0000000000..97d46bf0f0
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/changelog
@@ -0,0 +1,5 @@
+nutch (1.22-1) stable; urgency=low
+
+  * Initial Bigtop package
+
+ -- Apache Bigtop <dev@bigtop.apache.org>  Sat, 01 Jan 2024 00:00:00 +0000
diff --git a/bigtop-packages/src/deb/nutch/compat b/bigtop-packages/src/deb/nutch/compat
new file mode 100644
index 0000000000..f599e28b8a
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/compat
@@ -0,0 +1 @@
+10
diff --git a/bigtop-packages/src/deb/nutch/control b/bigtop-packages/src/deb/nutch/control
new file mode 100644
index 0000000000..6bbe568832
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/control
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Source: nutch
+Section: misc
+Priority: extra
+Maintainer: Apache Bigtop <dev@bigtop.apache.org>
+Build-Depends: debhelper (>= 10), ant
+Standards-Version: 3.8.0
+Homepage: https://nutch.apache.org
+
+Package: nutch
+Architecture: all
+Depends: ${misc:Depends}, bigtop-utils (>= 0.7), hadoop-client
+Description: Apache Nutch - extensible, scalable web crawler
+ Apache Nutch is an open source web crawler. It uses Apache Hadoop data
+ structures and MapReduce for batch processing, and can integrate with
+ Apache Solr or Elasticsearch for indexing and search.
diff --git a/bigtop-packages/src/deb/nutch/copyright b/bigtop-packages/src/deb/nutch/copyright
new file mode 100644
index 0000000000..7f7a841aa9
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/copyright
@@ -0,0 +1,15 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Source: https://nutch.apache.org
+Upstream-Name: Apache Nutch
+
+Files: *
+Copyright: 2004-2024, The Apache Software Foundation
+License: Apache-2.0
+
+Files: debian/*
+Copyright: 2024, The Apache Software Foundation
+License: Apache-2.0
+
+License: Apache-2.0
+ On Debian systems, the complete text of the Apache License 2.0
+ can be found in /usr/share/common-licenses/Apache-2.0.
diff --git a/bigtop-packages/src/deb/nutch/nutch.install b/bigtop-packages/src/deb/nutch/nutch.install
new file mode 100644
index 0000000000..726994ec17
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/nutch.install
@@ -0,0 +1,4 @@
+/etc/default/nutch
+/etc/nutch
+/usr/lib/nutch
+/usr/bin/nutch
diff --git a/bigtop-packages/src/deb/nutch/rules b/bigtop-packages/src/deb/nutch/rules
new file mode 100644
index 0000000000..8337aab283
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/rules
@@ -0,0 +1,37 @@
+#!/usr/bin/make -f
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# -*- makefile -*-
+
+export DH_VERBOSE=1
+export DH_OPTIONS
+
+%:
+ dh $@
+
+override_dh_auto_build:
+ env FULL_VERSION=${NUTCH_BASE_VERSION} bash debian/do-component-build
+
+override_dh_auto_install:
+ sh debian/install_nutch.sh \
+ --build-dir=. \
+ --distro-dir=debian \
+ --prefix=debian/tmp \
+ --bin-dir=debian/tmp/usr/bin \
+ --lib-dir=debian/tmp/usr/lib/nutch \
+ --etc-default=debian/tmp/etc/default \
+ --conf-dir=debian/tmp/etc/nutch
diff --git a/bigtop-packages/src/deb/nutch/source/format b/bigtop-packages/src/deb/nutch/source/format
new file mode 100644
index 0000000000..163aaf8d82
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec
new file mode 100644
index 0000000000..db406c7576
--- /dev/null
+++ b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+%define nutch_name nutch
+%define nutch_pkg_name %{nutch_name}%{?pkg_name_suffix:%{pkg_name_suffix}}
+%define hadoop_pkg_name hadoop%{?pkg_name_suffix:%{pkg_name_suffix}}
+
+%define etc_default %{?parent_dir:/%{parent_dir}}/etc/default
+%define usr_lib_nutch %{?parent_dir:/%{parent_dir}}/usr/lib/%{nutch_name}
+%define etc_nutch %{?parent_dir:/%{parent_dir}}/etc/%{nutch_name}
+%define bin_dir %{?parent_dir:/%{parent_dir}}%{_bindir}
+
+%define __os_install_post %{nil}
+
+Name: %{nutch_pkg_name}
+Version: %{nutch_version}
+Release: %{nutch_release}
+Summary: Apache Nutch - extensible, scalable web crawler
+URL: https://nutch.apache.org
+Group: Development/Libraries
+BuildArch: noarch
+Buildroot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
+License: ASL 2.0
+
+Source0: apache-nutch-%{nutch_base_version}-src.tar.gz
+Source1: do-component-build
+Source2: install_nutch.sh
+Source3: nutch.default
+
+Requires: bigtop-utils >= 0.7
+Requires: %{hadoop_pkg_name}-client
+
+%if %{?suse_version:1}0
+%else
+BuildRequires: ant
+%endif
+
+%description
+Apache Nutch is an open source web crawler. It uses Apache Hadoop data
+structures and MapReduce for batch processing, and can integrate with
+Apache Solr or Elasticsearch for indexing and search.
+
+%prep
+%setup -q -n apache-nutch-%{nutch_base_version}
+
+%build
+env FULL_VERSION=%{nutch_base_version} HADOOP_VERSION=%{hadoop_version} bash %{SOURCE1}
+
+%install
+%__rm -rf $RPM_BUILD_ROOT
+sh %{SOURCE2} \
+ --build-dir=%{_builddir}/apache-nutch-%{nutch_base_version} \
+ --prefix=$RPM_BUILD_ROOT \
+ --distro-dir=$RPM_SOURCE_DIR \
+ --bin-dir=$RPM_BUILD_ROOT%{bin_dir} \
+ --lib-dir=$RPM_BUILD_ROOT%{usr_lib_nutch} \
+ --etc-default=$RPM_BUILD_ROOT%{etc_default} \
+ --conf-dir=$RPM_BUILD_ROOT%{etc_nutch}
+
+%files
+%defattr(-,root,root,-)
+%config(noreplace) %{etc_default}/nutch
+%dir %{etc_nutch}
+%{etc_nutch}/conf.dist
+%{usr_lib_nutch}
+%{bin_dir}/nutch
diff --git a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy
new file mode 100644
index 0000000000..fc7a2f0653
--- /dev/null
+++ b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * All tests run Nutch on the Hadoop cluster using HDFS (runtime/deploy).
+ */
+
+package org.apache.bigtop.itest.nutch
+
+import org.apache.bigtop.itest.shell.Shell
+import org.junit.BeforeClass
+import org.junit.AfterClass
+import org.junit.Test
+import org.junit.FixMethodOrder
+import org.junit.runners.MethodSorters
+
+import static org.junit.Assert.assertTrue
+import static org.junit.Assert.assertNotNull
+
+@FixMethodOrder(MethodSorters.JVM)
+class TestNutchSmoke {
+ static Shell sh = new Shell("/bin/bash -s")
+
+ static final String NUTCH_CMD = "/usr/bin/nutch"
+ static final String HDFS_BASE = "/user/root/nutch-smoke"
+
+ @BeforeClass
+ static void setUp() {
+ sh.exec("hadoop fs -mkdir -p ${HDFS_BASE}/urls")
+ assertTrue("hadoop fs mkdir failed: " + sh.getErr(), sh.getRet() == 0)
+ sh.exec("echo 'https://bigtop.apache.org' | hadoop fs -put - ${HDFS_BASE}/urls/seed.txt")
+ assertTrue("hadoop fs put seed failed: " + sh.getErr(), sh.getRet() == 0)
+ }
+
+ @AfterClass
+ static void tearDown() {
+ sh.exec("hadoop fs -rm -r -f ${HDFS_BASE} 2>/dev/null || true")
+ }
+
+ @Test(timeout = 15000L)
+ void testNutchUsage() {
+ sh.exec("${NUTCH_CMD} showproperties")
+ assertTrue("nutch showproperties failed: " + sh.getErr(), sh.getRet() == 0)
+ String out = sh.getOut().toString() + " " + sh.getErr().toString()
+ assertTrue("nutch showproperties should print config (got: " + out + ")", out.contains("=") || out.contains("nutch"))
+ }
+
+ @Test
+ void testNutchInjectSubcommand() {
+ sh.exec("${NUTCH_CMD} inject")
+ assertTrue("nutch inject without args should fail with non-zero exit", sh.getRet() != 0)
+ String out = (sh.getOut().toString() + " " + sh.getErr().toString()).toLowerCase()
+ assertTrue("nutch inject should print usage or error (got: " + out + ")", out.contains("inject") || out.contains("usage") || out.contains("argument"))
+ }
+
+ @Test
+ void testNutchInjectAndReaddb() {
+ sh.exec("${NUTCH_CMD} inject ${HDFS_BASE}/crawldb ${HDFS_BASE}/urls")
+ assertTrue("nutch inject (HDFS) failed: " + sh.getErr(), sh.getRet() == 0)
+
+ sh.exec("${NUTCH_CMD} readdb ${HDFS_BASE}/crawldb -stats")
+ assertTrue("nutch readdb -stats (HDFS) failed: " + sh.getErr(), sh.getRet() == 0)
+ String statsOut = sh.getOut().toString() + " " + sh.getErr().toString()
+ assertTrue("readdb output should show url/crawldb stats (got: " + statsOut + ")", statsOut.contains("url") || statsOut.contains("Number") || statsOut.contains("count") || statsOut.contains("1"))
+ }
+
+ @Test
+ void testNutchGenerate() {
+ sh.exec("${NUTCH_CMD} inject ${HDFS_BASE}/crawldb ${HDFS_BASE}/urls && ${NUTCH_CMD} generate ${HDFS_BASE}/crawldb ${HDFS_BASE}/segments -topN 1")
+ assertTrue("nutch generate (HDFS) failed: " + sh.getErr(), sh.getRet() == 0)
+
+ sh.exec("hadoop fs -ls ${HDFS_BASE}/segments")
+ assertTrue("generate should create at least one segment under ${HDFS_BASE}/segments: " + sh.getErr(), sh.getRet() == 0 && (sh.getOut().toString() + sh.getErr().toString()).trim().length() > 0)
+ }
+}
diff --git a/bigtop-tests/smoke-tests/nutch/build.gradle b/bigtop-tests/smoke-tests/nutch/build.gradle
new file mode 100644
index 0000000000..9d8852dbd2
--- /dev/null
+++ b/bigtop-tests/smoke-tests/nutch/build.gradle
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+def tests_to_include() {
+ return [
+ "TestNutchSmoke.groovy"
+ ];
+}
+
+sourceSets {
+ test {
+ groovy {
+ srcDirs = ["${BIGTOP_HOME}/bigtop-tests/smoke-tests/nutch/"]
+ exclude { FileTreeElement elem -> (doExclude(elem.getName())) }
+ }
+ }
+}
+
+test.doFirst {
+ checkEnv(["JAVA_HOME", "HADOOP_CONF_DIR"])
+}
+
+test {
+ include "**/Test*"
+}
diff --git a/bigtop.bom b/bigtop.bom
index 0a958d9cdc..b122aab8cc 100644
--- a/bigtop.bom
+++ b/bigtop.bom
@@ -124,7 +124,7 @@ bigtop {
zookeeper:['hadoop', 'hbase', 'kafka'],
hadoop:['hbase', 'hive', 'tez',
'solr', 'spark', 'ranger',
- 'phoenix', 'alluxio', 'zeppelin'
+ 'phoenix', 'alluxio', 'zeppelin', 'nutch'
],
hbase:['phoenix', 'hive'],
hive:['zeppelin'],
@@ -212,6 +212,18 @@ bigtop {
site = "${apache.APACHE_MIRROR}/${download_path}"
archive = "${apache.APACHE_ARCHIVE}/${download_path}" }
}
+ 'nutch' {
+ name = 'nutch'
+ pkg = name
+ rpm_pkg_suffix = "_" + bigtop.base_version.replace(".", "_")
+ relNotes = 'Apache Nutch'
+ version { base = '1.22'; pkg = base; release = 1 }
+ tarball { source = "apache-nutch-${version.base}-src.tar.gz"
+ destination = source }
+ url { download_path = "/nutch/${version.base}"
+ site = "${apache.APACHE_MIRROR}/${download_path}"
+ archive = "${apache.APACHE_ARCHIVE}/${download_path}" }
+ }
'spark' {
name = 'spark'
pkg = 'spark'
diff --git a/bigtop_toolchain/manifests/packages.pp b/bigtop_toolchain/manifests/packages.pp
index 3aec81bbd8..b3d666c5a5 100644
--- a/bigtop_toolchain/manifests/packages.pp
+++ b/bigtop_toolchain/manifests/packages.pp
@@ -260,6 +260,7 @@
"dh-make",
"libfuse2",
"libjansi-java",
+ "ant",
"libxml2-dev",
"libxslt1-dev",
"zlib1g-dev",
diff --git a/packages.gradle b/packages.gradle
index e3c855329b..0e9c24466d 100644
--- a/packages.gradle
+++ b/packages.gradle
@@ -386,26 +386,35 @@ def genTasks = { target, packaging ->
def final DEB_PKG_NAME_SUFFIX = getPkgNameSuffix(config.bigtop.components[target], "deb").pkgNameSuffix
def final BIGTOP_BASE_VERSION = "${config.bigtop.base_version}"
+ delete "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION"
exec {
workingDir PKG_OUTPUT_DIR
commandLine "dpkg-source -x $SRCDEB".split(' ')
}
// Order of debuild parameters is important; hence specifying explicitely rather
// than in an array of args
+ def debArch = 'amd64'
+ try {
+ def proc = 'dpkg --print-architecture'.execute()
+ def archOut = proc.text?.trim(); proc.waitFor()
+ if (proc.exitValue() == 0 && archOut) debArch = archOut
+ } catch (Exception ignore) {}
+ def debuildDepsOpt = (target == 'nutch') ? ' -d' : ''
+ def nutchJava11Env = (target == 'nutch') ? "--set-envvar=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${debArch} " : ''
def command = """debuild \
--preserve-envvar PATH \
--preserve-envvar MAVEN3_HOME \
--preserve-envvar MAVEN_OPTS \
--preserve-envvar JAVA_HOME \
--preserve-envvar BIGTOP_JDK \
---set-envvar=HADOOP_VERSION=$HADOOP_VERSION \
+${nutchJava11Env}--set-envvar=HADOOP_VERSION=$HADOOP_VERSION \
--set-envvar=${toOldStyleName(target)}_BASE_VERSION=$BASE_VERSION \
--set-envvar=${toOldStyleName(target)}_VERSION=$PKG_VERSION \
--set-envvar=${toOldStyleName(target)}_RELEASE=$BIGTOP_BUILD_STAMP \
--set-envvar=PARENT_DIR=$FULL_PARENT_DIR \
--set-envvar=PKG_NAME_SUFFIX=$DEB_PKG_NAME_SUFFIX \
--set-envvar=bigtop_base_version=${BIGTOP_BASE_VERSION} \
--uc -us -b
+-uc -us -b${debuildDepsOpt}
"""
exec {
workingDir "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION"
@@ -533,13 +542,14 @@ def genTasks = { target, packaging ->
}
// Deleting obsolete files
delete fileTree(dir: "$DEB_BLD_DIR/debian", includes: ['*.ex', '*.EX', '*.~'])
- // Creating source package
+ // Creating source package (-d for nutch: ant is provided by toolchain at /usr/local/ant, not by apt)
+ def dpkgBuildpackageArgs = (target == 'nutch') ? ['-uc', '-us', '-sa', '-S', '-d'] : ['-uc', '-us', '-sa', '-S']
exec {
workingDir DEB_BLD_DIR
environment 'PARENT_DIR', FULL_PARENT_DIR
environment 'PKG_NAME_SUFFIX', DEB_PKG_NAME_SUFFIX
environment 'bigtop_base_version', BIGTOP_BASE_VERSION
- commandLine "dpkg-buildpackage -uc -us -sa -S".split(' ')
+ commandLine (['dpkg-buildpackage'] + dpkgBuildpackageArgs)
}
mkdir(PKG_OUTPUT_DIR)
copy {
@@ -978,6 +988,7 @@ task "apt" (
description: "Creating APT repository",
group: PACKAGES_GROUP) doLast {
+ file(BUILD_DIR).mkdirs()
delete ( "$OUTPUT_DIR/apt")
mkdir ("$OUTPUT_DIR/apt/conf")
@@ -988,7 +999,7 @@ task "apt" (
}.each { changeFile ->
exec {
workingDir BUILD_DIR
- commandLine "reprepro -Vb $OUTPUT_DIR/apt include bigtop $changeFile".split(' ')
+ commandLine "/usr/bin/reprepro", "-Vb", "$OUTPUT_DIR/apt", "include", "bigtop", changeFile.absolutePath
}
}
}
diff --git a/provisioner/docker/config_ubuntu-24.04.yaml b/provisioner/docker/config_ubuntu-24.04.yaml
index 5478178648..21a577fe9b 100644
--- a/provisioner/docker/config_ubuntu-24.04.yaml
+++ b/provisioner/docker/config_ubuntu-24.04.yaml
@@ -19,6 +19,6 @@ docker:
repo: "http://repos.bigtop.apache.org/releases/3.5.0/ubuntu/24.04/$(ARCH)"
distro: debian
-components: [hdfs, yarn, mapreduce]
+components: [hdfs, yarn, mapreduce, nutch]
enable_local_repo: false
-smoke_test_components: [hdfs, yarn, mapreduce]
+smoke_test_components: [hdfs, yarn, mapreduce, nutch]
diff --git a/provisioner/docker/docker-hadoop.sh b/provisioner/docker/docker-hadoop.sh
index 38ece152d0..94a5ad6da6 100755
--- a/provisioner/docker/docker-hadoop.sh
+++ b/provisioner/docker/docker-hadoop.sh
@@ -169,6 +169,7 @@ bigtop::hadoop_head_node: $1
hadoop::hadoop_storage_dirs: [/data/1, /data/2]
bigtop::bigtop_repo_uri: $2
bigtop::bigtop_repo_gpg_check: $gpg_check
+hadoop::hadoop_java_home: "/usr/lib/jvm/java-11-openjdk-$(dpkg --print-architecture 2>/dev/null || echo amd64)"
hadoop_cluster_node::cluster_components: $3
hadoop_cluster_node::cluster_nodes: [$node_list]
hadoop::common_yarn::yarn_resourcemanager_scheduler_class: org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
diff --git a/provisioner/utils/setup-env-debian.sh b/provisioner/utils/setup-env-debian.sh
index ccfee9ef8d..a2c8c1ba48 100755
--- a/provisioner/utils/setup-env-debian.sh
+++ b/provisioner/utils/setup-env-debian.sh
@@ -27,6 +27,19 @@ service rng-tools start
# The testing process would be broken due to "No such file or derictory: /etc/default/locale" in ubuntu16.04.
apt-get install -y locales
+# Enable universe on Ubuntu so openjdk-11-jdk is available (e.g. for Nutch 1.22)
+# Use same Signed-By as main Ubuntu sources to avoid "Conflicting values set for option Signed-By"
+if command -v lsb_release >/dev/null 2>&1 && [ "$(lsb_release -is 2>/dev/null)" = "Ubuntu" ]; then
+ release=$(lsb_release -sc 2>/dev/null)
+ if [ -n "$release" ] && [ ! -f /etc/apt/sources.list.d/universe.list ]; then
+ echo "deb [signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://archive.ubuntu.com/ubuntu $release universe" > /etc/apt/sources.list.d/universe.list
+ apt-get update
+ fi
+fi
+
+# OpenJDK 11 for components that require it (e.g. Nutch 1.22, class file version 55.0)
+apt-get install -y openjdk-11-jdk
+
if [ $enable_local_repo == "true" ]; then
echo "deb file:///bigtop-home/output/apt bigtop contrib" > /etc/apt/sources.list.d/bigtop-home_output.list
# In BIGTOP-2796 repo installed by puppet has priority 900, here we set higher priority for local repo
diff --git a/provisioner/utils/smoke-tests.sh b/provisioner/utils/smoke-tests.sh
index 8a04ad3248..0791fa4c29 100755
--- a/provisioner/utils/smoke-tests.sh
+++ b/provisioner/utils/smoke-tests.sh
@@ -22,6 +22,11 @@ if [ -z "$SMOKE_TESTS" ]; then
exit 2
fi
+# Prefer Java 11 when available (required for Nutch 1.22 and other components with class file 55.0)
+for jvm_dir in /usr/lib/jvm/java-11-openjdk-*; do
+ [ -d "$jvm_dir" ] && export JAVA_HOME="$jvm_dir" && break
+done
+
# Autodetect JAVA_HOME
if [ -e /usr/lib/bigtop-utils/bigtop-detect-javahome ]; then
. /usr/lib/bigtop-utils/bigtop-detect-javahome