diff --git a/bigtop-ci/build.sh b/bigtop-ci/build.sh index dec4e6ccb0..77d9072a10 100755 --- a/bigtop-ci/build.sh +++ b/bigtop-ci/build.sh @@ -109,7 +109,12 @@ fi # Start up build container CONTAINER_ID=`docker run -d $DOCKER_RUN_OPTION $NEXUS $IMAGE_NAME /sbin/init` -trap "docker rm -f $CONTAINER_ID" EXIT +trap '[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID' EXIT + +if [ -z "$CONTAINER_ID" ]; then + echo "Failed to start Docker container (e.g. permission denied). Ensure the user is in the docker group." + exit 1 +fi # Copy bigtop repo into container docker cp $BIGTOP_HOME $CONTAINER_ID:/bigtop @@ -124,7 +129,7 @@ RESULT=$? mkdir -p output docker cp $CONTAINER_ID:/bigtop/build . docker cp $CONTAINER_ID:/bigtop/output . -docker rm -f $CONTAINER_ID +[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID if [ $RESULT -ne 0 ]; then exit 1 diff --git a/bigtop-deploy/puppet/manifests/cluster.pp b/bigtop-deploy/puppet/manifests/cluster.pp index 177551ed6e..893c7b9552 100644 --- a/bigtop-deploy/puppet/manifests/cluster.pp +++ b/bigtop-deploy/puppet/manifests/cluster.pp @@ -65,6 +65,9 @@ solr => { worker => ["solr-server"], }, + nutch => { + client => ["nutch-client"], + }, spark => { worker => ["spark-on-yarn"], client => ["spark-client"], @@ -171,6 +174,7 @@ "hadoop_zookeeper", "hcatalog", "livy", + "nutch", "solr", "spark", "tez", diff --git a/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties new file mode 100644 index 0000000000..ba87f915dc --- /dev/null +++ b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Define some default values that can be overridden by system properties +hadoop.root.logger=INFO,console +hadoop.log.dir=. +hadoop.log.file=hadoop.log +# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) +yarn.app.container.log.dir=/tmp + +# Define the root logger to the system property "hadoop.root.logger". +log4j.rootLogger=${hadoop.root.logger}, EventCounter + +# Logging Threshold +log4j.threshhold=ALL + +# +# Daily Rolling File Appender +# + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). +# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. +# Must be defined so rootLogger can use CLA when this file is used for container log4j. 
+# +log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender +log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} +log4j.appender.CLA.containerLogFile=syslog +log4j.appender.CLA.totalLogFileSize=10485760 +log4j.appender.CLA.layout=org.apache.log4j.PatternLayout +log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# +# TaskLog Appender +# + +#Default values +hadoop.tasklog.taskid=null +hadoop.tasklog.noKeepSplits=4 +hadoop.tasklog.totalLogFileSize=100 +hadoop.tasklog.purgeLogSplits=true +hadoop.tasklog.logsRetainHours=12 +hadoop.tasklog.iscleanup=false + +log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender +log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} +log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} + +log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} + +log4j.appender.TLA.layout=org.apache.log4j.PatternLayout +log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics. 
+# +log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter + +#======= +# security audit logging + +security.audit.logger=INFO,console +log4j.category.SecurityLogger=${security.audit.logger} +log4j.additivity.SecurityLogger=false +log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log +log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd + +# hdfs audit logging + +hdfs.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} +log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false +log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log +log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd + + +# mapred audit logging + +mapred.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} +log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false +log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log +log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd + + +# Mapred job summary + +mapred.jobsummary.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger} +log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false +log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender 
+log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log +log4j.appender.JSA.layout=org.apache.log4j.PatternLayout +log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.JSA.DatePattern=.yyyy-MM-dd diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp index 164235dcec..bb704702f0 100644 --- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp +++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp @@ -128,6 +128,12 @@ require => [Package["hadoop"]], } + # Deploy log4j.properties for YARN containers (no CLA appender; container-safe). + file { "/etc/hadoop/conf/log4j.properties": + content => template('hadoop/log4j.properties.erb'), + require => [Package["hadoop"]], + } + package { "hadoop": ensure => latest, require => Package["jdk"], @@ -164,6 +170,8 @@ $hadoop_security_authentication = $hadoop::hadoop_security_authentication, $kerberos_realm = $hadoop::kerberos_realm, $yarn_nodemanager_vmem_check_enabled = undef, + # Ensure MR Application Master has log4j config (avoids exit code 1 in YARN containers) + $yarn_app_mapreduce_am_command_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", ) inherits hadoop { include hadoop::common @@ -429,8 +437,8 @@ $mapreduce_job_reduce_slowstart_completedmaps = undef, $mapreduce_map_memory_mb = undef, $mapreduce_reduce_memory_mb = undef, - $mapreduce_map_java_opts = "-Xmx1024m", - $mapreduce_reduce_java_opts = "-Xmx1024m", + $mapreduce_map_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", + $mapreduce_reduce_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", $hadoop_security_authentication = $hadoop::hadoop_security_authentication, $kerberos_realm = $hadoop::kerberos_realm, ) inherits hadoop { diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb 
b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb new file mode 100644 index 0000000000..4a14263e09 --- /dev/null +++ b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Define some default values that can be overridden by system properties +hadoop.root.logger=INFO,console +hadoop.log.dir=. 
+hadoop.log.file=hadoop.log + +# Root logger: only console (EventCounter/CLA not on YARN container classpath; causes AM ClassNotFoundException) +log4j.rootLogger=${hadoop.root.logger} + +# Logging Threshold +log4j.threshhold=ALL + +# +# Daily Rolling File Appender +# + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# CLA (ContainerLogAppender) omitted: class not on MR AM container classpath, causes ClassNotFoundException + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# +# TaskLog Appender +# + +#Default values +hadoop.tasklog.taskid=null +hadoop.tasklog.noKeepSplits=4 +hadoop.tasklog.totalLogFileSize=100 +hadoop.tasklog.purgeLogSplits=true +hadoop.tasklog.logsRetainHours=12 +hadoop.tasklog.iscleanup=false + +log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender +log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} +log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} + +log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} + +log4j.appender.TLA.layout=org.apache.log4j.PatternLayout +log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics. 
+# +log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter + +#======= +# security audit logging + +security.audit.logger=INFO,console +log4j.category.SecurityLogger=${security.audit.logger} +log4j.additivity.SecurityLogger=false +log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log +log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd + +# hdfs audit logging + +hdfs.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} +log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false +log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log +log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd + + +# mapred audit logging + +mapred.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} +log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false +log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log +log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd + + +# Mapred job summary + +mapred.jobsummary.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger} +log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false +log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender 
+log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log +log4j.appender.JSA.layout=org.apache.log4j.PatternLayout +log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.JSA.DatePattern=.yyyy-MM-dd diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml index d14eb3bb2c..d5de4af74a 100644 --- a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml +++ b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml @@ -211,6 +211,12 @@ $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/* + + + JVM options for the MapReduce Application Master (includes log4j config for container logs). + yarn.app.mapreduce.am.command-opts + <%= @yarn_app_mapreduce_am_command_opts %> + <% if @yarn_scheduler_minimum_allocation_mb -%> diff --git a/bigtop-deploy/puppet/modules/nutch/manifests/init.pp b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp new file mode 100644 index 0000000000..abd747e259 --- /dev/null +++ b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class nutch { + + class deploy ($roles) { + if ("nutch-client" in $roles) { + include nutch::client + } + } + + class client { + package { "nutch": + ensure => latest, + } + + file { "/etc/default/nutch": + content => template("nutch/nutch.default"), + require => Package["nutch"], + } + } +} diff --git a/bigtop-deploy/puppet/modules/nutch/templates/nutch.default b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default new file mode 100644 index 0000000000..44e659aee3 --- /dev/null +++ b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Nutch installation directory (runtime/deploy for Hadoop cluster) +export NUTCH_HOME=/usr/lib/nutch + +# Nutch configuration (required for deploy bin scripts) +export NUTCH_CONF_DIR=/etc/nutch/conf.dist + +# Hadoop configuration (required for cluster mode) +export HADOOP_CONF_DIR=/etc/hadoop/conf diff --git a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties index f75a299775..23ed3bfc87 100644 --- a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties +++ b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties @@ -19,7 +19,8 @@ hadoop.log.dir=. hadoop.log.file=hadoop.log # Define the root logger to the system property "hadoop.root.logger". -log4j.rootLogger=${hadoop.root.logger}, EventCounter +# Container-safe: no CLA (ContainerLogAppender) or EventCounter in rootLogger. +log4j.rootLogger=${hadoop.root.logger} # Logging Threshold log4j.threshhold=ALL @@ -41,6 +42,8 @@ log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n # Debugging Pattern format #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n +# CLA (ContainerLogAppender) omitted: not on MR AM container classpath; causes errors in YARN containers. + # # console # Add "console" to rootlogger above if you want to use this diff --git a/bigtop-packages/src/common/hadoop/install_hadoop.sh b/bigtop-packages/src/common/hadoop/install_hadoop.sh index 966890f007..4ec5ca1a6d 100755 --- a/bigtop-packages/src/common/hadoop/install_hadoop.sh +++ b/bigtop-packages/src/common/hadoop/install_hadoop.sh @@ -391,6 +391,12 @@ cp ${DISTRO_DIR}/conf.empty/mapred-site.xml $PREFIX/$ETC_HADOOP/conf.empty sed -i -e '/^[^#]/s,^,#,' ${BUILD_DIR}/etc/hadoop/hadoop-env.sh cp -r ${BUILD_DIR}/etc/hadoop/* $PREFIX/$ETC_HADOOP/conf.empty rm -rf $PREFIX/$ETC_HADOOP/conf.empty/*.cmd +# Overwrite with Bigtop log4j.properties (includes CLA for YARN containers, e.g. MR AM) +# If not found (e.g. 
DEB build layout), Puppet deploys it at provision time. +BIGTOP_LOG4J="$(dirname $0)/../conf.secure/log4j.properties" +if [ -f "$BIGTOP_LOG4J" ]; then + cp "$BIGTOP_LOG4J" $PREFIX/$ETC_HADOOP/conf.empty/log4j.properties +fi # Install default wrapper install -d -m 0755 $PREFIX/$ETC_DEFAULT diff --git a/bigtop-packages/src/common/nutch/do-component-build b/bigtop-packages/src/common/nutch/do-component-build new file mode 100644 index 0000000000..f5da0fbf5b --- /dev/null +++ b/bigtop-packages/src/common/nutch/do-component-build @@ -0,0 +1,30 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +if [ -f "$SCRIPT_DIR/bigtop.bom" ]; then + . 
"$SCRIPT_DIR/bigtop.bom" +fi + +# FULL_VERSION is set by the RPM/DEB build +FULL_VERSION=${FULL_VERSION:-$NUTCH_VERSION} +# HADOOP_VERSION from bigtop.bom for aligning with the stack +BUILD_OPTS="-Dhadoop.version=${HADOOP_VERSION:-3.3.0}" + +ant $BUILD_OPTS clean +ant $BUILD_OPTS runtime diff --git a/bigtop-packages/src/common/nutch/install_nutch.sh b/bigtop-packages/src/common/nutch/install_nutch.sh new file mode 100644 index 0000000000..608991fa68 --- /dev/null +++ b/bigtop-packages/src/common/nutch/install_nutch.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +usage() { + echo " +usage: $0 + Required: + --distro-dir=DIR path to distro-specific files (SOURCES) + --build-dir=DIR path to Nutch source tree (contains runtime/local and runtime/deploy) + --prefix=PREFIX path to install into (e.g. \$RPM_BUILD_ROOT) + + Optional: + --bin-dir=DIR path for binaries [/usr/bin] + --lib-dir=DIR path for Nutch home [/usr/lib/nutch] + --etc-default=DIR path for defaults [/etc/default] + --conf-dir=DIR path for config [/etc/nutch] + " + exit 1 +} + +OPTS=$(getopt -n $0 -o '' \ + -l 'prefix:' -l 'distro-dir:' -l 'build-dir:' \ + -l 'bin-dir:' -l 'lib-dir:' -l 'etc-default:' -l 'conf-dir:' -- "$@") +[ $? 
= 0 ] || usage +eval set -- "$OPTS" + +PREFIX= BUILD_DIR= DISTRO_DIR= BIN_DIR= LIB_DIR= ETC_DEFAULT= CONF_DIR= +while true; do + case "$1" in + --prefix) PREFIX=$2; shift 2 ;; + --distro-dir) DISTRO_DIR=$2; shift 2 ;; + --build-dir) BUILD_DIR=$2; shift 2 ;; + --bin-dir) BIN_DIR=$2; shift 2 ;; + --lib-dir) LIB_DIR=$2; shift 2 ;; + --etc-default) ETC_DEFAULT=$2; shift 2 ;; + --conf-dir) CONF_DIR=$2; shift 2 ;; + --) shift; break ;; + *) echo "Unknown option: $1"; usage ;; + esac +done + +for var in PREFIX BUILD_DIR DISTRO_DIR; do + [ -n "$(eval "echo \$$var")" ] || { echo "Missing param: $var"; usage; } +done + +BIN_DIR=${BIN_DIR:-$PREFIX/usr/bin} +LIB_DIR=${LIB_DIR:-$PREFIX/usr/lib/nutch} +ETC_DEFAULT=${ETC_DEFAULT:-$PREFIX/etc/default} +CONF_DIR=${CONF_DIR:-$PREFIX/etc/nutch} + +RUNTIME_LOCAL="$BUILD_DIR/runtime/local" +RUNTIME_DEPLOY="$BUILD_DIR/runtime/deploy" +[ -d "$RUNTIME_LOCAL" ] || { echo "Build dir has no runtime/local: $RUNTIME_LOCAL"; exit 1; } +[ -d "$RUNTIME_DEPLOY" ] || { echo "Build dir has no runtime/deploy: $RUNTIME_DEPLOY"; exit 1; } + +# Install runtime/deploy for cluster (uber jar + bin that uses hadoop jar) +install -d -m 0755 "$(dirname "$LIB_DIR")" +cp -a "$RUNTIME_DEPLOY" "$LIB_DIR" + +# Conf from runtime/local (deploy may not include full conf) +install -d -m 0755 "$(dirname "$CONF_DIR")" +install -d -m 0755 "$CONF_DIR/conf.dist" +cp -a "$RUNTIME_LOCAL/conf/"* "$CONF_DIR/conf.dist/" 2>/dev/null || true + +install -d -m 0755 "$ETC_DEFAULT" +mkdir -p "$ETC_DEFAULT" +install -m 0644 "$DISTRO_DIR/nutch.default" "$ETC_DEFAULT/nutch" + +# Wrapper script for /usr/bin/nutch +install -d -m 0755 "$BIN_DIR" +cat > "$BIN_DIR/nutch" << 'WRAPPER' +#!/bin/bash +# Nutch launcher - sources /etc/default/nutch and runs nutch from NUTCH_HOME +if [ -f /etc/default/nutch ]; then + . 
/etc/default/nutch +fi +NUTCH_HOME=${NUTCH_HOME:-/usr/lib/nutch} +exec "$NUTCH_HOME/bin/nutch" "$@" +WRAPPER +chmod 755 "$BIN_DIR/nutch" diff --git a/bigtop-packages/src/common/nutch/nutch.default b/bigtop-packages/src/common/nutch/nutch.default new file mode 100644 index 0000000000..6dc31e74eb --- /dev/null +++ b/bigtop-packages/src/common/nutch/nutch.default @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Nutch installation directory (runtime/deploy for Hadoop cluster)
+export NUTCH_HOME=/usr/lib/nutch
+
+# Nutch configuration (required for deploy bin scripts)
+export NUTCH_CONF_DIR=${NUTCH_CONF_DIR:-/etc/nutch/conf.dist}
+
+# Hadoop configuration (required for cluster mode)
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
diff --git a/bigtop-packages/src/deb/nutch/changelog b/bigtop-packages/src/deb/nutch/changelog
new file mode 100644
index 0000000000..97d46bf0f0
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/changelog
@@ -0,0 +1,5 @@
+nutch (1.22-1) stable; urgency=low
+
+  * Initial Bigtop package
+
+ -- Apache Bigtop <dev@bigtop.apache.org>  Sat, 01 Jan 2024 00:00:00 +0000
diff --git a/bigtop-packages/src/deb/nutch/compat b/bigtop-packages/src/deb/nutch/compat
new file mode 100644
index 0000000000..f599e28b8a
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/compat
@@ -0,0 +1 @@
+10
diff --git a/bigtop-packages/src/deb/nutch/control b/bigtop-packages/src/deb/nutch/control
new file mode 100644
index 0000000000..6bbe568832
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/control
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Source: nutch
+Section: misc
+Priority: extra
+Maintainer: Apache Bigtop <dev@bigtop.apache.org>
+Build-Depends: debhelper (>= 7.0.50~), ant
+Standards-Version: 3.8.0
+Homepage: https://nutch.apache.org
+
+Package: nutch
+Architecture: all
+Depends: bigtop-utils (>= 0.7), hadoop-client
+Description: Apache Nutch - extensible, scalable web crawler
+ Apache Nutch is an open source web crawler. It uses Apache Hadoop data
+ structures and MapReduce for batch processing, and can integrate with
+ Apache Solr or Elasticsearch for indexing and search.
diff --git a/bigtop-packages/src/deb/nutch/copyright b/bigtop-packages/src/deb/nutch/copyright
new file mode 100644
index 0000000000..7f7a841aa9
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/copyright
@@ -0,0 +1,15 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Source: https://nutch.apache.org
+Upstream-Name: Apache Nutch
+
+Files: *
+Copyright: 2004-2024, The Apache Software Foundation
+License: Apache-2.0
+
+Files: debian/*
+Copyright: 2024, The Apache Software Foundation
+License: Apache-2.0
+
+License: Apache-2.0
+ On Debian systems, the complete text of the Apache License 2.0
+ can be found in /usr/share/common-licenses/Apache-2.0.
diff --git a/bigtop-packages/src/deb/nutch/nutch.install b/bigtop-packages/src/deb/nutch/nutch.install
new file mode 100644
index 0000000000..726994ec17
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/nutch.install
@@ -0,0 +1,4 @@
+/etc/default/nutch
+/etc/nutch
+/usr/lib/nutch
+/usr/bin/nutch
diff --git a/bigtop-packages/src/deb/nutch/rules b/bigtop-packages/src/deb/nutch/rules
new file mode 100644
index 0000000000..8337aab283
--- /dev/null
+++ b/bigtop-packages/src/deb/nutch/rules
@@ -0,0 +1,37 @@
+#!/usr/bin/make -f
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- makefile -*- + +export DH_VERBOSE=1 +export DH_OPTIONS + +%: + dh $@ + +override_dh_auto_build: + env FULL_VERSION=${NUTCH_BASE_VERSION} bash debian/do-component-build + +override_dh_auto_install: + sh debian/install_nutch.sh \ + --build-dir=. \ + --distro-dir=debian \ + --prefix=debian/tmp \ + --bin-dir=debian/tmp/usr/bin \ + --lib-dir=debian/tmp/usr/lib/nutch \ + --etc-default=debian/tmp/etc/default \ + --conf-dir=debian/tmp/etc/nutch diff --git a/bigtop-packages/src/deb/nutch/source/format b/bigtop-packages/src/deb/nutch/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec new file mode 100644 index 0000000000..db406c7576 --- /dev/null +++ b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +%define nutch_name nutch +%define nutch_pkg_name %{nutch_name}%{?pkg_name_suffix:%{pkg_name_suffix}} +%define hadoop_pkg_name hadoop%{?pkg_name_suffix:%{pkg_name_suffix}} + +%define etc_default %{?parent_dir:/%{parent_dir}}/etc/default +%define usr_lib_nutch %{?parent_dir:/%{parent_dir}}/usr/lib/%{nutch_name} +%define etc_nutch %{?parent_dir:/%{parent_dir}}/etc/%{nutch_name} +%define bin_dir %{?parent_dir:/%{parent_dir}}%{_bindir} + +%define __os_install_post %{nil} + +Name: %{nutch_pkg_name} +Version: %{nutch_version} +Release: %{nutch_release} +Summary: Apache Nutch - extensible, scalable web crawler +URL: https://nutch.apache.org +Group: Development/Libraries +BuildArch: noarch +Buildroot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +License: ASL 2.0 + +Source0: apache-nutch-%{nutch_base_version}-src.tar.gz +Source1: do-component-build +Source2: install_nutch.sh +Source3: nutch.default + +Requires: bigtop-utils >= 0.7 +Requires: %{hadoop_pkg_name}-client + +%if %{?suse_version:1}0 +%else +BuildRequires: ant +%endif + +%description +Apache Nutch is an open source web crawler. It uses Apache Hadoop data +structures and MapReduce for batch processing, and can integrate with +Apache Solr or Elasticsearch for indexing and search. 
+ +%prep +%setup -q -n apache-nutch-%{nutch_base_version} + +%build +env FULL_VERSION=%{nutch_base_version} HADOOP_VERSION=%{hadoop_version} bash %{SOURCE1} + +%install +%__rm -rf $RPM_BUILD_ROOT +sh %{SOURCE2} \ + --build-dir=%{_builddir}/apache-nutch-%{nutch_base_version} \ + --prefix=$RPM_BUILD_ROOT \ + --distro-dir=$RPM_SOURCE_DIR \ + --bin-dir=$RPM_BUILD_ROOT%{bin_dir} \ + --lib-dir=$RPM_BUILD_ROOT%{usr_lib_nutch} \ + --etc-default=$RPM_BUILD_ROOT%{etc_default} \ + --conf-dir=$RPM_BUILD_ROOT%{etc_nutch} + +%files +%defattr(-,root,root,-) +%config(noreplace) %{etc_default}/nutch +%dir %{etc_nutch} +%{etc_nutch}/conf.dist +%{usr_lib_nutch} +%{bin_dir}/nutch diff --git a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy new file mode 100644 index 0000000000..fc7a2f0653 --- /dev/null +++ b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * All tests run Nutch on the Hadoop cluster using HDFS (runtime/deploy). + */ + +package org.apache.bigtop.itest.nutch + +import org.apache.bigtop.itest.shell.Shell +import org.junit.BeforeClass +import org.junit.AfterClass +import org.junit.Test +import org.junit.FixMethodOrder +import org.junit.runners.MethodSorters + +import static org.junit.Assert.assertTrue +import static org.junit.Assert.assertNotNull + +@FixMethodOrder(MethodSorters.JVM) +class TestNutchSmoke { + static Shell sh = new Shell("/bin/bash -s") + + static final String NUTCH_CMD = "/usr/bin/nutch" + static final String HDFS_BASE = "/user/root/nutch-smoke" + + @BeforeClass + static void setUp() { + sh.exec("hadoop fs -mkdir -p ${HDFS_BASE}/urls") + assertTrue("hadoop fs mkdir failed: " + sh.getErr(), sh.getRet() == 0) + sh.exec("echo 'https://bigtop.apache.org' | hadoop fs -put - ${HDFS_BASE}/urls/seed.txt") + assertTrue("hadoop fs put seed failed: " + sh.getErr(), sh.getRet() == 0) + } + + @AfterClass + static void tearDown() { + sh.exec("hadoop fs -rm -r -f ${HDFS_BASE} 2>/dev/null || true") + } + + @Test(timeout = 15000L) + void testNutchUsage() { + sh.exec("${NUTCH_CMD} showproperties") + assertTrue("nutch showproperties failed: " + sh.getErr(), sh.getRet() == 0) + String out = sh.getOut().toString() + " " + sh.getErr().toString() + assertTrue("nutch showproperties should print config (got: " + out + ")", out.contains("=") || out.contains("nutch")) + } + + @Test + void testNutchInjectSubcommand() { + sh.exec("${NUTCH_CMD} inject") + assertTrue("nutch inject without args should fail with non-zero exit", sh.getRet() != 0) + String out = (sh.getOut().toString() + " " + 
sh.getErr().toString()).toLowerCase() + assertTrue("nutch inject should print usage or error (got: " + out + ")", out.contains("inject") || out.contains("usage") || out.contains("argument")) + } + + @Test + void testNutchInjectAndReaddb() { + sh.exec("${NUTCH_CMD} inject ${HDFS_BASE}/crawldb ${HDFS_BASE}/urls") + assertTrue("nutch inject (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + + sh.exec("${NUTCH_CMD} readdb ${HDFS_BASE}/crawldb -stats") + assertTrue("nutch readdb -stats (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + String statsOut = sh.getOut().toString() + " " + sh.getErr().toString() + assertTrue("readdb output should show url/crawldb stats (got: " + statsOut + ")", statsOut.contains("url") || statsOut.contains("Number") || statsOut.contains("count") || statsOut.contains("1")) + } + + @Test + void testNutchGenerate() { + sh.exec("${NUTCH_CMD} generate ${HDFS_BASE}/crawldb ${HDFS_BASE}/segments -topN 1") + assertTrue("nutch generate (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + + sh.exec("hadoop fs -ls ${HDFS_BASE}/segments") + assertTrue("generate should create at least one segment under ${HDFS_BASE}/segments: " + sh.getErr(), sh.getRet() == 0 && (sh.getOut().toString() + sh.getErr().toString()).trim().length() > 0) + } +} diff --git a/bigtop-tests/smoke-tests/nutch/build.gradle b/bigtop-tests/smoke-tests/nutch/build.gradle new file mode 100644 index 0000000000..9d8852dbd2 --- /dev/null +++ b/bigtop-tests/smoke-tests/nutch/build.gradle @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +def tests_to_include() { + return [ + "TestNutchSmoke.groovy" + ]; +} + +sourceSets { + test { + groovy { + srcDirs = ["${BIGTOP_HOME}/bigtop-tests/smoke-tests/nutch/"] + exclude { FileTreeElement elem -> (doExclude(elem.getName())) } + } + } +} + +test.doFirst { + checkEnv(["JAVA_HOME", "HADOOP_CONF_DIR"]) +} + +test { + include "**/Test*" +} diff --git a/bigtop.bom b/bigtop.bom index 0a958d9cdc..b122aab8cc 100644 --- a/bigtop.bom +++ b/bigtop.bom @@ -124,7 +124,7 @@ bigtop { zookeeper:['hadoop', 'hbase', 'kafka'], hadoop:['hbase', 'hive', 'tez', 'solr', 'spark', 'ranger', - 'phoenix', 'alluxio', 'zeppelin' + 'phoenix', 'alluxio', 'zeppelin', 'nutch' ], hbase:['phoenix', 'hive'], hive:['zeppelin'], @@ -212,6 +212,18 @@ bigtop { site = "${apache.APACHE_MIRROR}/${download_path}" archive = "${apache.APACHE_ARCHIVE}/${download_path}" } } + 'nutch' { + name = 'nutch' + pkg = name + rpm_pkg_suffix = "_" + bigtop.base_version.replace(".", "_") + relNotes = 'Apache Nutch' + version { base = '1.22'; pkg = base; release = 1 } + tarball { source = "apache-nutch-${version.base}-src.tar.gz" + destination = source } + url { download_path = "/nutch/${version.base}" + site = "${apache.APACHE_MIRROR}/${download_path}" + archive = "${apache.APACHE_ARCHIVE}/${download_path}" } + } 'spark' { name = 'spark' pkg = 'spark' diff --git a/bigtop_toolchain/manifests/packages.pp b/bigtop_toolchain/manifests/packages.pp index 3aec81bbd8..b3d666c5a5 100644 --- a/bigtop_toolchain/manifests/packages.pp +++ b/bigtop_toolchain/manifests/packages.pp @@ -260,6 +260,7 @@ "dh-make", "libfuse2", "libjansi-java", + "ant", "libxml2-dev", "libxslt1-dev", "zlib1g-dev", diff 
--git a/packages.gradle b/packages.gradle index e3c855329b..0e9c24466d 100644 --- a/packages.gradle +++ b/packages.gradle @@ -386,26 +386,35 @@ def genTasks = { target, packaging -> def final DEB_PKG_NAME_SUFFIX = getPkgNameSuffix(config.bigtop.components[target], "deb").pkgNameSuffix def final BIGTOP_BASE_VERSION = "${config.bigtop.base_version}" + delete "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION" exec { workingDir PKG_OUTPUT_DIR commandLine "dpkg-source -x $SRCDEB".split(' ') } // Order of debuild parameters is important; hence specifying explicitely rather // than in an array of args + def debArch = 'amd64' + try { + def proc = 'dpkg --print-architecture'.execute() + proc.waitFor() + if (proc.exitValue() == 0 && proc.text?.trim()) debArch = proc.text.trim() + } catch (Exception ignore) {} + def debuildDepsOpt = (target == 'nutch') ? ' -d' : '' + def nutchJava11Env = (target == 'nutch') ? "--set-envvar=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${debArch} " : '' def command = """debuild \ --preserve-envvar PATH \ --preserve-envvar MAVEN3_HOME \ --preserve-envvar MAVEN_OPTS \ --preserve-envvar JAVA_HOME \ --preserve-envvar BIGTOP_JDK \ ---set-envvar=HADOOP_VERSION=$HADOOP_VERSION \ +${nutchJava11Env}--set-envvar=HADOOP_VERSION=$HADOOP_VERSION \ --set-envvar=${toOldStyleName(target)}_BASE_VERSION=$BASE_VERSION \ --set-envvar=${toOldStyleName(target)}_VERSION=$PKG_VERSION \ --set-envvar=${toOldStyleName(target)}_RELEASE=$BIGTOP_BUILD_STAMP \ --set-envvar=PARENT_DIR=$FULL_PARENT_DIR \ --set-envvar=PKG_NAME_SUFFIX=$DEB_PKG_NAME_SUFFIX \ --set-envvar=bigtop_base_version=${BIGTOP_BASE_VERSION} \ --uc -us -b +-uc -us -b${debuildDepsOpt} """ exec { workingDir "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION" @@ -533,13 +542,14 @@ def genTasks = { target, packaging -> } // Deleting obsolete files delete fileTree(dir: "$DEB_BLD_DIR/debian", includes: ['*.ex', '*.EX', '*.~']) - // Creating source package + // Creating source package (-d for nutch: ant is provided by toolchain at 
/usr/local/ant, not by apt) + def dpkgBuildpackageArgs = (target == 'nutch') ? ['-uc', '-us', '-sa', '-S', '-d'] : ['-uc', '-us', '-sa', '-S'] exec { workingDir DEB_BLD_DIR environment 'PARENT_DIR', FULL_PARENT_DIR environment 'PKG_NAME_SUFFIX', DEB_PKG_NAME_SUFFIX environment 'bigtop_base_version', BIGTOP_BASE_VERSION - commandLine "dpkg-buildpackage -uc -us -sa -S".split(' ') + commandLine (['dpkg-buildpackage'] + dpkgBuildpackageArgs) } mkdir(PKG_OUTPUT_DIR) copy { @@ -978,6 +988,7 @@ task "apt" ( description: "Creating APT repository", group: PACKAGES_GROUP) doLast { + file(BUILD_DIR).mkdirs() delete ( "$OUTPUT_DIR/apt") mkdir ("$OUTPUT_DIR/apt/conf") @@ -988,7 +999,7 @@ task "apt" ( }.each { changeFile -> exec { workingDir BUILD_DIR - commandLine "reprepro -Vb $OUTPUT_DIR/apt include bigtop $changeFile".split(' ') + commandLine "/usr/bin/reprepro", "-Vb", "$OUTPUT_DIR/apt", "include", "bigtop", changeFile.absolutePath } } } diff --git a/provisioner/docker/config_ubuntu-24.04.yaml b/provisioner/docker/config_ubuntu-24.04.yaml index 5478178648..21a577fe9b 100644 --- a/provisioner/docker/config_ubuntu-24.04.yaml +++ b/provisioner/docker/config_ubuntu-24.04.yaml @@ -19,6 +19,6 @@ docker: repo: "http://repos.bigtop.apache.org/releases/3.5.0/ubuntu/24.04/$(ARCH)" distro: debian -components: [hdfs, yarn, mapreduce] +components: [hdfs, yarn, mapreduce, nutch] enable_local_repo: false -smoke_test_components: [hdfs, yarn, mapreduce] +smoke_test_components: [hdfs, yarn, mapreduce, nutch] diff --git a/provisioner/docker/docker-hadoop.sh b/provisioner/docker/docker-hadoop.sh index 38ece152d0..94a5ad6da6 100755 --- a/provisioner/docker/docker-hadoop.sh +++ b/provisioner/docker/docker-hadoop.sh @@ -169,6 +169,7 @@ bigtop::hadoop_head_node: $1 hadoop::hadoop_storage_dirs: [/data/1, /data/2] bigtop::bigtop_repo_uri: $2 bigtop::bigtop_repo_gpg_check: $gpg_check +hadoop::hadoop_java_home: "/usr/lib/jvm/java-11-openjdk-amd64" hadoop_cluster_node::cluster_components: $3 
hadoop_cluster_node::cluster_nodes: [$node_list] hadoop::common_yarn::yarn_resourcemanager_scheduler_class: org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler diff --git a/provisioner/utils/setup-env-debian.sh b/provisioner/utils/setup-env-debian.sh index ccfee9ef8d..a2c8c1ba48 100755 --- a/provisioner/utils/setup-env-debian.sh +++ b/provisioner/utils/setup-env-debian.sh @@ -27,6 +27,19 @@ service rng-tools start # The testing process would be broken due to "No such file or derictory: /etc/default/locale" in ubuntu16.04. apt-get install -y locales +# Enable universe on Ubuntu so openjdk-11-jdk is available (e.g. for Nutch 1.22) +# Use same Signed-By as main Ubuntu sources to avoid "Conflicting values set for option Signed-By" +if command -v lsb_release >/dev/null 2>&1 && [ "$(lsb_release -is 2>/dev/null)" = "Ubuntu" ]; then + release=$(lsb_release -sc 2>/dev/null) + if [ -n "$release" ] && [ ! -f /etc/apt/sources.list.d/universe.list ]; then + echo "deb [signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://archive.ubuntu.com/ubuntu $release universe" > /etc/apt/sources.list.d/universe.list + apt-get update + fi +fi + +# OpenJDK 11 for components that require it (e.g. 
Nutch 1.22, class file version 55.0) +apt-get install -y openjdk-11-jdk + if [ $enable_local_repo == "true" ]; then echo "deb file:///bigtop-home/output/apt bigtop contrib" > /etc/apt/sources.list.d/bigtop-home_output.list # In BIGTOP-2796 repo installed by puppet has priority 900, here we set higher priority for local repo diff --git a/provisioner/utils/smoke-tests.sh b/provisioner/utils/smoke-tests.sh index 8a04ad3248..0791fa4c29 100755 --- a/provisioner/utils/smoke-tests.sh +++ b/provisioner/utils/smoke-tests.sh @@ -22,6 +22,11 @@ if [ -z "$SMOKE_TESTS" ]; then exit 2 fi +# Prefer Java 11 when available (required for Nutch 1.22 and other components with class file 55.0) +if [ -d /usr/lib/jvm/java-11-openjdk-amd64 ]; then + export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +fi + # Autodetect JAVA_HOME if [ -e /usr/lib/bigtop-utils/bigtop-detect-javahome ]; then . /usr/lib/bigtop-utils/bigtop-detect-javahome