From 09f386ee85fe75f65d43af6c40cd668ea8a7f4b8 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 15:24:34 -0800 Subject: [PATCH 01/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-deploy/puppet/manifests/cluster.pp | 4 + .../puppet/modules/nutch/manifests/init.pp | 34 +++++++ .../modules/nutch/templates/nutch.default | 23 +++++ .../src/common/nutch/do-component-build | 30 ++++++ .../src/common/nutch/install_nutch.sh | 94 +++++++++++++++++++ .../src/common/nutch/nutch.default | 23 +++++ bigtop-packages/src/deb/nutch/changelog | 4 + bigtop-packages/src/deb/nutch/control | 30 ++++++ bigtop-packages/src/deb/nutch/copyright | 15 +++ bigtop-packages/src/deb/nutch/nutch.install | 4 + bigtop-packages/src/deb/nutch/rules | 37 ++++++++ bigtop-packages/src/deb/nutch/source/format | 1 + .../src/rpm/nutch/SPECS/nutch.spec | 78 +++++++++++++++ .../smoke-tests/nutch/TestNutchSmoke.groovy | 86 +++++++++++++++++ bigtop-tests/smoke-tests/nutch/build.gradle | 40 ++++++++ bigtop.bom | 14 ++- provisioner/docker/config_ubuntu-24.04.yaml | 4 +- 17 files changed, 518 insertions(+), 3 deletions(-) create mode 100644 bigtop-deploy/puppet/modules/nutch/manifests/init.pp create mode 100644 bigtop-deploy/puppet/modules/nutch/templates/nutch.default create mode 100644 bigtop-packages/src/common/nutch/do-component-build create mode 100644 bigtop-packages/src/common/nutch/install_nutch.sh create mode 100644 bigtop-packages/src/common/nutch/nutch.default create mode 100644 bigtop-packages/src/deb/nutch/changelog create mode 100644 bigtop-packages/src/deb/nutch/control create mode 100644 bigtop-packages/src/deb/nutch/copyright create mode 100644 bigtop-packages/src/deb/nutch/nutch.install create mode 100644 bigtop-packages/src/deb/nutch/rules create mode 100644 bigtop-packages/src/deb/nutch/source/format create mode 100644 bigtop-packages/src/rpm/nutch/SPECS/nutch.spec create mode 100644 bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy create mode 100644 bigtop-tests/smoke-tests/nutch/build.gradle diff --git a/bigtop-deploy/puppet/manifests/cluster.pp b/bigtop-deploy/puppet/manifests/cluster.pp index 177551ed6e..893c7b9552 100644 --- a/bigtop-deploy/puppet/manifests/cluster.pp +++ b/bigtop-deploy/puppet/manifests/cluster.pp @@ -65,6 +65,9 @@ solr => { worker => ["solr-server"], }, + nutch => { + client => ["nutch-client"], + }, spark => { worker => ["spark-on-yarn"], client => ["spark-client"], @@ -171,6 +174,7 @@ "hadoop_zookeeper", "hcatalog", "livy", + "nutch", "solr", "spark", "tez", diff --git a/bigtop-deploy/puppet/modules/nutch/manifests/init.pp b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp new file mode 100644 index 0000000000..abd747e259 --- /dev/null +++ b/bigtop-deploy/puppet/modules/nutch/manifests/init.pp @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +class nutch { + + class deploy ($roles) { + if ("nutch-client" in $roles) { + include nutch::client + } + } + + class client { + package { "nutch": + ensure => latest, + } + + file { "/etc/default/nutch": + content => template("nutch/nutch.default"), + require => Package["nutch"], + } + } +} diff --git a/bigtop-deploy/puppet/modules/nutch/templates/nutch.default b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default new file mode 100644 index 0000000000..44e659aee3 --- /dev/null +++ b/bigtop-deploy/puppet/modules/nutch/templates/nutch.default @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Nutch installation directory (runtime/deploy for Hadoop cluster) +export NUTCH_HOME=/usr/lib/nutch + +# Nutch configuration (required for deploy bin scripts) +export NUTCH_CONF_DIR=/etc/nutch/conf.dist + +# Hadoop configuration (required for cluster mode) +export HADOOP_CONF_DIR=/etc/hadoop/conf diff --git a/bigtop-packages/src/common/nutch/do-component-build b/bigtop-packages/src/common/nutch/do-component-build new file mode 100644 index 0000000000..f5da0fbf5b --- /dev/null +++ b/bigtop-packages/src/common/nutch/do-component-build @@ -0,0 +1,30 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +if [ -f "$SCRIPT_DIR/bigtop.bom" ]; then + . "$SCRIPT_DIR/bigtop.bom" +fi + +# FULL_VERSION is set by the RPM/DEB build +FULL_VERSION=${FULL_VERSION:-$NUTCH_VERSION} +# HADOOP_VERSION from bigtop.bom for aligning with the stack +BUILD_OPTS="-Dhadoop.version=${HADOOP_VERSION:-3.3.0}" + +ant $BUILD_OPTS clean +ant $BUILD_OPTS runtime diff --git a/bigtop-packages/src/common/nutch/install_nutch.sh b/bigtop-packages/src/common/nutch/install_nutch.sh new file mode 100644 index 0000000000..8de99be096 --- /dev/null +++ b/bigtop-packages/src/common/nutch/install_nutch.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +usage() { + echo " +usage: $0 + Required: + --distro-dir=DIR path to distro-specific files (SOURCES) + --build-dir=DIR path to Nutch source tree (contains runtime/local and runtime/deploy) + --prefix=PREFIX path to install into (e.g. \$RPM_BUILD_ROOT) + + Optional: + --bin-dir=DIR path for binaries [/usr/bin] + --lib-dir=DIR path for Nutch home [/usr/lib/nutch] + --etc-default=DIR path for defaults [/etc/default] + --conf-dir=DIR path for config [/etc/nutch] + " + exit 1 +} + +OPTS=$(getopt -n $0 -o '' \ + -l 'prefix:' -l 'distro-dir:' -l 'build-dir:' \ + -l 'bin-dir:' -l 'lib-dir:' -l 'etc-default:' -l 'conf-dir:' -- "$@") +[ $? = 0 ] || usage +eval set -- "$OPTS" + +PREFIX= BUILD_DIR= DISTRO_DIR= BIN_DIR= LIB_DIR= ETC_DEFAULT= CONF_DIR= +while true; do + case "$1" in + --prefix) PREFIX=$2; shift 2 ;; + --distro-dir) DISTRO_DIR=$2; shift 2 ;; + --build-dir) BUILD_DIR=$2; shift 2 ;; + --bin-dir) BIN_DIR=$2; shift 2 ;; + --lib-dir) LIB_DIR=$2; shift 2 ;; + --etc-default) ETC_DEFAULT=$2; shift 2 ;; + --conf-dir) CONF_DIR=$2; shift 2 ;; + --) shift; break ;; + *) echo "Unknown option: $1"; usage ;; + esac +done + +for var in PREFIX BUILD_DIR DISTRO_DIR; do + [ -n "$(eval "echo \$$var")" ] || { echo "Missing param: $var"; usage; } +done + +BIN_DIR=${BIN_DIR:-$PREFIX/usr/bin} +LIB_DIR=${LIB_DIR:-$PREFIX/usr/lib/nutch} +ETC_DEFAULT=${ETC_DEFAULT:-$PREFIX/etc/default} +CONF_DIR=${CONF_DIR:-$PREFIX/etc/nutch} + +RUNTIME_LOCAL="$BUILD_DIR/runtime/local" +RUNTIME_DEPLOY="$BUILD_DIR/runtime/deploy" +[ -d "$RUNTIME_LOCAL" ] || { echo "Build dir has no runtime/local: $RUNTIME_LOCAL"; exit 1; } +[ -d "$RUNTIME_DEPLOY" ] || { echo "Build dir has no runtime/deploy: $RUNTIME_DEPLOY"; exit 1; } + +# Install runtime/deploy for cluster (uber jar + bin that uses hadoop jar) +install -d -m 0755 "$(dirname "$LIB_DIR")" +cp -a "$RUNTIME_DEPLOY" "$LIB_DIR" + +# Conf from runtime/local (deploy may not include full conf) +install -d -m 0755 "$(dirname "$CONF_DIR")" +install -d -m 0755 "$CONF_DIR/conf.dist" +cp -a "$RUNTIME_LOCAL/conf/"* "$CONF_DIR/conf.dist/" 2>/dev/null || true + +install -d -m 0755 "$(dirname "$ETC_DEFAULT")" +install -m 0644 "$DISTRO_DIR/nutch.default" "$ETC_DEFAULT/nutch" + +# Wrapper script for /usr/bin/nutch +install -d -m 0755 "$BIN_DIR" +cat > "$BIN_DIR/nutch" << 'WRAPPER' +#!/bin/bash +# Nutch launcher - sources /etc/default/nutch and runs nutch from NUTCH_HOME +if [ -f /etc/default/nutch ]; then + . /etc/default/nutch +fi +NUTCH_HOME=${NUTCH_HOME:-/usr/lib/nutch} +exec "$NUTCH_HOME/bin/nutch" "$@" +WRAPPER +chmod 755 "$BIN_DIR/nutch" diff --git a/bigtop-packages/src/common/nutch/nutch.default b/bigtop-packages/src/common/nutch/nutch.default new file mode 100644 index 0000000000..6dc31e74eb --- /dev/null +++ b/bigtop-packages/src/common/nutch/nutch.default @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Nutch installation directory (runtime/deploy for Hadoop cluster) +export NUTCH_HOME=/usr/lib/nutch + +# Nutch configuration (required for deploy bin scripts) +export NUTCH_CONF_DIR=${NUTCH_CONF_DIR:-/etc/nutch/conf.dist} + +# Hadoop configuration (required for cluster mode) +export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} diff --git a/bigtop-packages/src/deb/nutch/changelog b/bigtop-packages/src/deb/nutch/changelog new file mode 100644 index 0000000000..97d46bf0f0 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/changelog @@ -0,0 +1,4 @@ +nutch (1.22-1) stable; urgency=low + * Initial Bigtop package + + -- Apache Bigtop Sat, 01 Jan 2024 00:00:00 +0000 diff --git a/bigtop-packages/src/deb/nutch/control b/bigtop-packages/src/deb/nutch/control new file mode 100644 index 0000000000..6bbe568832 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/control @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Source: nutch +Section: misc +Priority: extra +Maintainer: Apache Bigtop +Build-Depends: debhelper (>= 7.0.50~), ant +Standards-Version: 3.8.0 +Homepage: https://nutch.apache.org + +Package: nutch +Architecture: all +Depends: bigtop-utils (>= 0.7), hadoop-client +Description: Apache Nutch - extensible, scalable web crawler + Apache Nutch is an open source web crawler. It uses Apache Hadoop data + structures and MapReduce for batch processing, and can integrate with + Apache Solr or Elasticsearch for indexing and search. diff --git a/bigtop-packages/src/deb/nutch/copyright b/bigtop-packages/src/deb/nutch/copyright new file mode 100644 index 0000000000..7f7a841aa9 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/copyright @@ -0,0 +1,15 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Source: https://nutch.apache.org +Upstream-Name: Apache Nutch + +Files: * +Copyright: 2004-2024, The Apache Software Foundation +License: Apache-2.0 + +Files: debian/* +Copyright: 2024, The Apache Software Foundation +License: Apache-2.0 + +License: Apache-2.0 + On Debian systems, the complete text of the Apache License 2.0 + can be found in /usr/share/common-licenses/Apache-2.0. diff --git a/bigtop-packages/src/deb/nutch/nutch.install b/bigtop-packages/src/deb/nutch/nutch.install new file mode 100644 index 0000000000..726994ec17 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/nutch.install @@ -0,0 +1,4 @@ +/etc/default/nutch +/etc/nutch +/usr/lib/nutch +/usr/bin/nutch diff --git a/bigtop-packages/src/deb/nutch/rules b/bigtop-packages/src/deb/nutch/rules new file mode 100644 index 0000000000..8337aab283 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/rules @@ -0,0 +1,37 @@ +#!/usr/bin/make -f +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# -*- makefile -*- + +export DH_VERBOSE=1 +export DH_OPTIONS + +%: + dh $@ + +override_dh_auto_build: + env FULL_VERSION=${NUTCH_BASE_VERSION} bash debian/do-component-build + +override_dh_auto_install: + sh debian/install_nutch.sh \ + --build-dir=. \ + --distro-dir=debian \ + --prefix=debian/tmp \ + --bin-dir=debian/tmp/usr/bin \ + --lib-dir=debian/tmp/usr/lib/nutch \ + --etc-default=debian/tmp/etc/default \ + --conf-dir=debian/tmp/etc/nutch diff --git a/bigtop-packages/src/deb/nutch/source/format b/bigtop-packages/src/deb/nutch/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/bigtop-packages/src/deb/nutch/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec new file mode 100644 index 0000000000..db406c7576 --- /dev/null +++ b/bigtop-packages/src/rpm/nutch/SPECS/nutch.spec @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +%define nutch_name nutch +%define nutch_pkg_name %{nutch_name}%{?pkg_name_suffix:%{pkg_name_suffix}} +%define hadoop_pkg_name hadoop%{?pkg_name_suffix:%{pkg_name_suffix}} + +%define etc_default %{?parent_dir:/%{parent_dir}}/etc/default +%define usr_lib_nutch %{?parent_dir:/%{parent_dir}}/usr/lib/%{nutch_name} +%define etc_nutch %{?parent_dir:/%{parent_dir}}/etc/%{nutch_name} +%define bin_dir %{?parent_dir:/%{parent_dir}}%{_bindir} + +%define __os_install_post %{nil} + +Name: %{nutch_pkg_name} +Version: %{nutch_version} +Release: %{nutch_release} +Summary: Apache Nutch - extensible, scalable web crawler +URL: https://nutch.apache.org +Group: Development/Libraries +BuildArch: noarch +Buildroot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +License: ASL 2.0 + +Source0: apache-nutch-%{nutch_base_version}-src.tar.gz +Source1: do-component-build +Source2: install_nutch.sh +Source3: nutch.default + +Requires: bigtop-utils >= 0.7 +Requires: %{hadoop_pkg_name}-client + +%if %{?suse_version:1}0 +%else +BuildRequires: ant +%endif + +%description +Apache Nutch is an open source web crawler. It uses Apache Hadoop data +structures and MapReduce for batch processing, and can integrate with +Apache Solr or Elasticsearch for indexing and search. + +%prep +%setup -q -n apache-nutch-%{nutch_base_version} + +%build +env FULL_VERSION=%{nutch_base_version} HADOOP_VERSION=%{hadoop_version} bash %{SOURCE1} + +%install +%__rm -rf $RPM_BUILD_ROOT +sh %{SOURCE2} \ + --build-dir=%{_builddir}/apache-nutch-%{nutch_base_version} \ + --prefix=$RPM_BUILD_ROOT \ + --distro-dir=$RPM_SOURCE_DIR \ + --bin-dir=$RPM_BUILD_ROOT%{bin_dir} \ + --lib-dir=$RPM_BUILD_ROOT%{usr_lib_nutch} \ + --etc-default=$RPM_BUILD_ROOT%{etc_default} \ + --conf-dir=$RPM_BUILD_ROOT%{etc_nutch} + +%files +%defattr(-,root,root,-) +%config(noreplace) %{etc_default}/nutch +%dir %{etc_nutch} +%{etc_nutch}/conf.dist +%{usr_lib_nutch} +%{bin_dir}/nutch diff --git a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy new file mode 100644 index 0000000000..c91ebfbc4c --- /dev/null +++ b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * All tests run Nutch on the Hadoop cluster using HDFS (runtime/deploy). + */ + +package org.apache.bigtop.itest.nutch + +import org.apache.bigtop.itest.shell.Shell +import org.junit.BeforeClass +import org.junit.AfterClass +import org.junit.Test +import org.junit.FixMethodOrder +import org.junit.runners.MethodSorters + +import static org.junit.Assert.assertTrue +import static org.junit.Assert.assertNotNull + +@FixMethodOrder(MethodSorters.JVM) +class TestNutchSmoke { + static Shell sh = new Shell("/bin/bash -s") + + static final String NUTCH_CMD = "/usr/bin/nutch" + static final String HDFS_BASE = "/user/root/nutch-smoke" + + @BeforeClass + static void setUp() { + sh.exec("hadoop fs -mkdir -p ${HDFS_BASE}/urls") + assertTrue("hadoop fs mkdir failed: " + sh.getErr(), sh.getRet() == 0) + sh.exec("echo 'https://bigtop.apache.org' | hadoop fs -put - ${HDFS_BASE}/urls/seed.txt") + assertTrue("hadoop fs put seed failed: " + sh.getErr(), sh.getRet() == 0) + } + + @AfterClass + static void tearDown() { + sh.exec("hadoop fs -rm -r -f ${HDFS_BASE} 2>/dev/null || true") + } + + @Test + void testNutchUsage() { + sh.exec(NUTCH_CMD) + assertTrue("nutch usage failed: " + sh.getErr(), sh.getRet() == 0) + } + + @Test + void testNutchInjectSubcommand() { + sh.exec("${NUTCH_CMD} inject") + assertTrue("nutch inject without args should fail with non-zero exit", sh.getRet() != 0) + String out = (sh.getOut() + " " + sh.getErr()).toLowerCase() + assertTrue("nutch inject should print usage or error (got: " + out + ")", out.contains("inject") || out.contains("usage") || out.contains("argument")) + } + + @Test + void testNutchInjectAndReaddb() { + sh.exec("${NUTCH_CMD} inject ${HDFS_BASE}/crawldb ${HDFS_BASE}/urls") + assertTrue("nutch inject (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + + sh.exec("${NUTCH_CMD} readdb ${HDFS_BASE}/crawldb -stats") + assertTrue("nutch readdb -stats (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + String statsOut = sh.getOut() + " " + sh.getErr() + assertTrue("readdb output should show url/crawldb stats (got: " + statsOut + ")", statsOut.contains("url") || statsOut.contains("Number") || statsOut.contains("count") || statsOut.contains("1")) + } + + @Test + void testNutchGenerate() { + sh.exec("${NUTCH_CMD} generate ${HDFS_BASE}/crawldb ${HDFS_BASE}/segments -topN 1") + assertTrue("nutch generate (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) + + sh.exec("hadoop fs -ls ${HDFS_BASE}/segments") + assertTrue("generate should create at least one segment under ${HDFS_BASE}/segments: " + sh.getErr(), sh.getRet() == 0 && (sh.getOut() + sh.getErr()).trim().length() > 0) + } +} diff --git a/bigtop-tests/smoke-tests/nutch/build.gradle b/bigtop-tests/smoke-tests/nutch/build.gradle new file mode 100644 index 0000000000..9d8852dbd2 --- /dev/null +++ b/bigtop-tests/smoke-tests/nutch/build.gradle @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +def tests_to_include() { + return [ + "TestNutchSmoke.groovy" + ]; +} + +sourceSets { + test { + groovy { + srcDirs = ["${BIGTOP_HOME}/bigtop-tests/smoke-tests/nutch/"] + exclude { FileTreeElement elem -> (doExclude(elem.getName())) } + } + } +} + +test.doFirst { + checkEnv(["JAVA_HOME", "HADOOP_CONF_DIR"]) +} + +test { + include "**/Test*" +} diff --git a/bigtop.bom b/bigtop.bom index 0a958d9cdc..b122aab8cc 100644 --- a/bigtop.bom +++ b/bigtop.bom @@ -124,7 +124,7 @@ bigtop { zookeeper:['hadoop', 'hbase', 'kafka'], hadoop:['hbase', 'hive', 'tez', 'solr', 'spark', 'ranger', - 'phoenix', 'alluxio', 'zeppelin' + 'phoenix', 'alluxio', 'zeppelin', 'nutch' ], hbase:['phoenix', 'hive'], hive:['zeppelin'], @@ -212,6 +212,18 @@ bigtop { site = "${apache.APACHE_MIRROR}/${download_path}" archive = "${apache.APACHE_ARCHIVE}/${download_path}" } } + 'nutch' { + name = 'nutch' + pkg = name + rpm_pkg_suffix = "_" + bigtop.base_version.replace(".", "_") + relNotes = 'Apache Nutch' + version { base = '1.22'; pkg = base; release = 1 } + tarball { source = "apache-nutch-${version.base}-src.tar.gz" + destination = source } + url { download_path = "/nutch/${version.base}" + site = "${apache.APACHE_MIRROR}/${download_path}" + archive = "${apache.APACHE_ARCHIVE}/${download_path}" } + } 'spark' { name = 'spark' pkg = 'spark' diff --git a/provisioner/docker/config_ubuntu-24.04.yaml b/provisioner/docker/config_ubuntu-24.04.yaml index 5478178648..21a577fe9b 100644 --- a/provisioner/docker/config_ubuntu-24.04.yaml +++ b/provisioner/docker/config_ubuntu-24.04.yaml @@ -19,6 +19,6 @@ docker: repo: "http://repos.bigtop.apache.org/releases/3.5.0/ubuntu/24.04/$(ARCH)" distro: debian -components: [hdfs, yarn, mapreduce] +components: [hdfs, yarn, mapreduce, nutch] enable_local_repo: false -smoke_test_components: [hdfs, yarn, mapreduce] +smoke_test_components: [hdfs, yarn, mapreduce, nutch] From 5f0ce1e07fdab5e1edaae93120b4462ec6bf6e43 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:20:56 -0800 Subject: [PATCH 02/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-ci/build.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bigtop-ci/build.sh b/bigtop-ci/build.sh index dec4e6ccb0..77d9072a10 100755 --- a/bigtop-ci/build.sh +++ b/bigtop-ci/build.sh @@ -109,7 +109,12 @@ fi # Start up build container CONTAINER_ID=`docker run -d $DOCKER_RUN_OPTION $NEXUS $IMAGE_NAME /sbin/init` -trap "docker rm -f $CONTAINER_ID" EXIT +trap '[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID' EXIT + +if [ -z "$CONTAINER_ID" ]; then + echo "Failed to start Docker container (e.g. permission denied). Ensure the user is in the docker group." + exit 1 +fi # Copy bigtop repo into container docker cp $BIGTOP_HOME $CONTAINER_ID:/bigtop @@ -124,7 +129,7 @@ RESULT=$? mkdir -p output docker cp $CONTAINER_ID:/bigtop/build . docker cp $CONTAINER_ID:/bigtop/output . -docker rm -f $CONTAINER_ID +[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID if [ $RESULT -ne 0 ]; then exit 1 From e1f527f9580d008410dd216a956b9df6da51a0ac Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:37:05 -0800 Subject: [PATCH 03/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop_toolchain/manifests/packages.pp | 1 + 1 file changed, 1 insertion(+) diff --git a/bigtop_toolchain/manifests/packages.pp b/bigtop_toolchain/manifests/packages.pp index 3aec81bbd8..b3d666c5a5 100644 --- a/bigtop_toolchain/manifests/packages.pp +++ b/bigtop_toolchain/manifests/packages.pp @@ -260,6 +260,7 @@ "dh-make", "libfuse2", "libjansi-java", + "ant", "libxml2-dev", "libxslt1-dev", "zlib1g-dev", From af639212a8898796fd960f83004c2717bea99f8c Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:40:02 -0800 Subject: [PATCH 04/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages.gradle b/packages.gradle index e3c855329b..724a62b0d0 100644 --- a/packages.gradle +++ b/packages.gradle @@ -533,13 +533,14 @@ def genTasks = { target, packaging -> } // Deleting obsolete files delete fileTree(dir: "$DEB_BLD_DIR/debian", includes: ['*.ex', '*.EX', '*.~']) - // Creating source package + // Creating source package (-d for nutch: ant is provided by toolchain at /usr/local/ant, not by apt) + def dpkgBuildpackageArgs = (target == 'nutch') ? ['-uc', '-us', '-sa', '-S', '-d'] : ['-uc', '-us', '-sa', '-S'] exec { workingDir DEB_BLD_DIR environment 'PARENT_DIR', FULL_PARENT_DIR environment 'PKG_NAME_SUFFIX', DEB_PKG_NAME_SUFFIX environment 'bigtop_base_version', BIGTOP_BASE_VERSION - commandLine "dpkg-buildpackage -uc -us -sa -S".split(' ') + commandLine (['dpkg-buildpackage'] + dpkgBuildpackageArgs) } mkdir(PKG_OUTPUT_DIR) copy { From 2200a293806c7e9f93d74d5b7987cd638abfdc6d Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:42:12 -0800 Subject: [PATCH 05/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-packages/src/deb/nutch/compat | 1 + 1 file changed, 1 insertion(+) create mode 100644 bigtop-packages/src/deb/nutch/compat diff --git a/bigtop-packages/src/deb/nutch/compat b/bigtop-packages/src/deb/nutch/compat new file mode 100644 index 0000000000..f599e28b8a --- /dev/null +++ b/bigtop-packages/src/deb/nutch/compat @@ -0,0 +1 @@ +10 From 27dd2c751952de8003a29583699b6ffb5f7e036c Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:44:49 -0800 Subject: [PATCH 06/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages.gradle b/packages.gradle index 724a62b0d0..2db2a76331 100644 --- a/packages.gradle +++ b/packages.gradle @@ -392,6 +392,7 @@ def genTasks = { target, packaging -> } // Order of debuild parameters is important; hence specifying explicitely rather // than in an array of args + def debuildDepsOpt = (target == 'nutch') ? ' -d' : '' def command = """debuild \ --preserve-envvar PATH \ --preserve-envvar MAVEN3_HOME \ @@ -405,7 +406,7 @@ def genTasks = { target, packaging -> --set-envvar=PARENT_DIR=$FULL_PARENT_DIR \ --set-envvar=PKG_NAME_SUFFIX=$DEB_PKG_NAME_SUFFIX \ --set-envvar=bigtop_base_version=${BIGTOP_BASE_VERSION} \ --uc -us -b +-uc -us -b${debuildDepsOpt} """ exec { workingDir "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION" From 1bed6ddfd20d44d2b6472e0c542edd9a9c11e89d Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:47:08 -0800 Subject: [PATCH 07/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/packages.gradle b/packages.gradle index 2db2a76331..e02b716d7f 100644 --- a/packages.gradle +++ b/packages.gradle @@ -386,6 +386,7 @@ def genTasks = { target, packaging -> def final DEB_PKG_NAME_SUFFIX = getPkgNameSuffix(config.bigtop.components[target], "deb").pkgNameSuffix def final BIGTOP_BASE_VERSION = "${config.bigtop.base_version}" + delete "$PKG_OUTPUT_DIR/$PKG_NAME-$PKG_VERSION" exec { workingDir PKG_OUTPUT_DIR commandLine "dpkg-source -x $SRCDEB".split(' ') From b5d92a551b4743ea634c366908171ba8e1d00618 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:55:11 -0800 Subject: [PATCH 08/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages.gradle b/packages.gradle index e02b716d7f..2adbc0f420 100644 --- a/packages.gradle +++ b/packages.gradle @@ -393,14 +393,21 @@ def genTasks = { target, packaging -> } // Order of debuild parameters is important; hence specifying explicitely rather // than in an array of args + def debArch = 'amd64' + try { + def proc = 'dpkg --print-architecture'.execute() + proc.waitFor() + if (proc.exitValue() == 0 && proc.text?.trim()) debArch = proc.text.trim() + } catch (Exception ignore) {} def debuildDepsOpt = (target == 'nutch') ? ' -d' : '' + def nutchJava11Env = (target == 'nutch') ? "--set-envvar=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${debArch} \\\n" : '' def command = """debuild \ --preserve-envvar PATH \ --preserve-envvar MAVEN3_HOME \ --preserve-envvar MAVEN_OPTS \ --preserve-envvar JAVA_HOME \ --preserve-envvar BIGTOP_JDK \ ---set-envvar=HADOOP_VERSION=$HADOOP_VERSION \ +${nutchJava11Env}--set-envvar=HADOOP_VERSION=$HADOOP_VERSION \ --set-envvar=${toOldStyleName(target)}_BASE_VERSION=$BASE_VERSION \ --set-envvar=${toOldStyleName(target)}_VERSION=$PKG_VERSION \ --set-envvar=${toOldStyleName(target)}_RELEASE=$BIGTOP_BUILD_STAMP \ From ef2d739d2eb1555decd7a17ec2eb20cf472fc430 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 28 Feb 2026 19:56:37 -0800 Subject: [PATCH 09/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages.gradle b/packages.gradle index 2adbc0f420..d732124b86 100644 --- a/packages.gradle +++ b/packages.gradle @@ -400,7 +400,7 @@ def genTasks = { target, packaging -> if (proc.exitValue() == 0 && proc.text?.trim()) debArch = proc.text.trim() } catch (Exception ignore) {} def debuildDepsOpt = (target == 'nutch') ? ' -d' : '' - def nutchJava11Env = (target == 'nutch') ? "--set-envvar=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${debArch} \\\n" : '' + def nutchJava11Env = (target == 'nutch') ? "--set-envvar=JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${debArch} " : '' def command = """debuild \ --preserve-envvar PATH \ --preserve-envvar MAVEN3_HOME \ From 680087754cd3a2b84663987092144ac098a85c4b Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 01:14:12 -0800 Subject: [PATCH 10/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-packages/src/common/nutch/install_nutch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtop-packages/src/common/nutch/install_nutch.sh b/bigtop-packages/src/common/nutch/install_nutch.sh index 8de99be096..1bf322cba3 100644 --- a/bigtop-packages/src/common/nutch/install_nutch.sh +++ b/bigtop-packages/src/common/nutch/install_nutch.sh @@ -77,7 +77,7 @@ install -d -m 0755 "$(dirname "$CONF_DIR")" install -d -m 0755 "$CONF_DIR/conf.dist" cp -a "$RUNTIME_LOCAL/conf/"* "$CONF_DIR/conf.dist/" 2>/dev/null || true -install -d -m 0755 "$(dirname "$ETC_DEFAULT")" +install -d -m 0755 "$ETC_DEFAULT" install -m 0644 "$DISTRO_DIR/nutch.default" "$ETC_DEFAULT/nutch" # Wrapper script for /usr/bin/nutch From 202a50c75de4ffab62b9ab95e3fa21d9a0b19ca3 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 01:26:37 -0800 Subject: [PATCH 11/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-packages/src/common/nutch/install_nutch.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bigtop-packages/src/common/nutch/install_nutch.sh b/bigtop-packages/src/common/nutch/install_nutch.sh index 1bf322cba3..608991fa68 100644 --- a/bigtop-packages/src/common/nutch/install_nutch.sh +++ b/bigtop-packages/src/common/nutch/install_nutch.sh @@ -78,6 +78,7 @@ install -d -m 0755 "$CONF_DIR/conf.dist" cp -a "$RUNTIME_LOCAL/conf/"* "$CONF_DIR/conf.dist/" 2>/dev/null || true install -d -m 0755 "$ETC_DEFAULT" +mkdir -p "$ETC_DEFAULT" install -m 0644 "$DISTRO_DIR/nutch.default" "$ETC_DEFAULT/nutch" # Wrapper script for /usr/bin/nutch From a445ab562bc40fe49196e61892fb9f0093911ff5 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 10:58:08 -0800 Subject: [PATCH 12/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy index c91ebfbc4c..85910b7a91 100644 --- a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy +++ b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy @@ -60,7 +60,7 @@ class TestNutchSmoke { void testNutchInjectSubcommand() { sh.exec("${NUTCH_CMD} inject") assertTrue("nutch inject without args should fail with non-zero exit", sh.getRet() != 0) - String out = (sh.getOut() + " " + sh.getErr()).toLowerCase() + String out = (sh.getOut().toString() + " " + sh.getErr().toString()).toLowerCase() assertTrue("nutch inject should print usage or error (got: " + out + ")", out.contains("inject") || out.contains("usage") || out.contains("argument")) } @@ -71,7 +71,7 @@ class TestNutchSmoke { sh.exec("${NUTCH_CMD} readdb ${HDFS_BASE}/crawldb -stats") assertTrue("nutch readdb -stats (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) - String statsOut = sh.getOut() + " " + sh.getErr() + String statsOut = sh.getOut().toString() + " " + sh.getErr().toString() assertTrue("readdb output should show url/crawldb stats (got: " + statsOut + ")", statsOut.contains("url") || statsOut.contains("Number") || statsOut.contains("count") || statsOut.contains("1")) } @@ -81,6 +81,6 @@ class TestNutchSmoke { assertTrue("nutch generate (HDFS) failed: " + sh.getErr(), sh.getRet() == 0) sh.exec("hadoop fs -ls ${HDFS_BASE}/segments") - assertTrue("generate should create at least one segment under ${HDFS_BASE}/segments: " + sh.getErr(), sh.getRet() == 0 && (sh.getOut() + sh.getErr()).trim().length() > 0) + assertTrue("generate should create at least one segment under ${HDFS_BASE}/segments: " + sh.getErr(), sh.getRet() == 0 && (sh.getOut().toString() + sh.getErr().toString()).trim().length() > 0) } } From 796e170f4b0c3f65e47e8707eca1ec196f968f5c Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 11:23:43 -0800 Subject: [PATCH 13/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages.gradle b/packages.gradle index d732124b86..85ea9cb512 100644 --- a/packages.gradle +++ b/packages.gradle @@ -998,7 +998,7 @@ task "apt" ( }.each { changeFile -> exec { workingDir BUILD_DIR - commandLine "reprepro -Vb $OUTPUT_DIR/apt include bigtop $changeFile".split(' ') + commandLine "/usr/bin/reprepro", "-Vb", "$OUTPUT_DIR/apt", "include", "bigtop", changeFile.absolutePath } } } From ac1cbdcbacf4030a0b53c2160a9abed7c569ab22 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 11:28:10 -0800 Subject: [PATCH 14/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- packages.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/packages.gradle b/packages.gradle index 85ea9cb512..0e9c24466d 100644 --- a/packages.gradle +++ b/packages.gradle @@ -988,6 +988,7 @@ task "apt" ( description: "Creating APT repository", group: PACKAGES_GROUP) doLast { + file(BUILD_DIR).mkdirs() delete ( "$OUTPUT_DIR/apt") mkdir ("$OUTPUT_DIR/apt/conf") From 22ec46322a5e94e231e591b4892dd66495374b9d Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 11:44:45 -0800 Subject: [PATCH 15/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- provisioner/docker/docker-hadoop.sh | 1 + provisioner/utils/setup-env-debian.sh | 3 +++ provisioner/utils/smoke-tests.sh | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/provisioner/docker/docker-hadoop.sh b/provisioner/docker/docker-hadoop.sh index 38ece152d0..94a5ad6da6 100755 --- a/provisioner/docker/docker-hadoop.sh +++ b/provisioner/docker/docker-hadoop.sh @@ -169,6 +169,7 @@ bigtop::hadoop_head_node: $1 hadoop::hadoop_storage_dirs: [/data/1, /data/2] bigtop::bigtop_repo_uri: $2 bigtop::bigtop_repo_gpg_check: $gpg_check +hadoop::hadoop_java_home: "/usr/lib/jvm/java-11-openjdk-amd64" hadoop_cluster_node::cluster_components: $3 hadoop_cluster_node::cluster_nodes: [$node_list] hadoop::common_yarn::yarn_resourcemanager_scheduler_class: org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler diff --git a/provisioner/utils/setup-env-debian.sh b/provisioner/utils/setup-env-debian.sh index ccfee9ef8d..efcb3dca98 100755 --- a/provisioner/utils/setup-env-debian.sh +++ b/provisioner/utils/setup-env-debian.sh @@ -27,6 +27,9 @@ service rng-tools start # The testing process would be broken due to "No such file or derictory: /etc/default/locale" in ubuntu16.04. apt-get install -y locales +# OpenJDK 11 for components that require it (e.g. Nutch 1.22, class file version 55.0) +apt-get install -y openjdk-11-jdk + if [ $enable_local_repo == "true" ]; then echo "deb file:///bigtop-home/output/apt bigtop contrib" > /etc/apt/sources.list.d/bigtop-home_output.list # In BIGTOP-2796 repo installed by puppet has priority 900, here we set higher priority for local repo diff --git a/provisioner/utils/smoke-tests.sh b/provisioner/utils/smoke-tests.sh index 8a04ad3248..0791fa4c29 100755 --- a/provisioner/utils/smoke-tests.sh +++ b/provisioner/utils/smoke-tests.sh @@ -22,6 +22,11 @@ if [ -z "$SMOKE_TESTS" ]; then exit 2 fi +# Prefer Java 11 when available (required for Nutch 1.22 and other components with class file 55.0) +if [ -d /usr/lib/jvm/java-11-openjdk-amd64 ]; then + export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +fi + # Autodetect JAVA_HOME if [ -e /usr/lib/bigtop-utils/bigtop-detect-javahome ]; then . /usr/lib/bigtop-utils/bigtop-detect-javahome From 57b2b419778b3fe6c6482ad24a1b374187c5e175 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 11:55:39 -0800 Subject: [PATCH 16/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- provisioner/utils/setup-env-debian.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/provisioner/utils/setup-env-debian.sh b/provisioner/utils/setup-env-debian.sh index efcb3dca98..734d6fbdc6 100755 --- a/provisioner/utils/setup-env-debian.sh +++ b/provisioner/utils/setup-env-debian.sh @@ -27,6 +27,15 @@ service rng-tools start # The testing process would be broken due to "No such file or derictory: /etc/default/locale" in ubuntu16.04. apt-get install -y locales +# Enable universe on Ubuntu so openjdk-11-jdk is available (e.g. for Nutch 1.22) +if command -v lsb_release >/dev/null 2>&1 && [ "$(lsb_release -is 2>/dev/null)" = "Ubuntu" ]; then + release=$(lsb_release -sc 2>/dev/null) + if [ -n "$release" ] && [ ! -f /etc/apt/sources.list.d/universe.list ]; then + echo "deb http://archive.ubuntu.com/ubuntu $release universe" > /etc/apt/sources.list.d/universe.list + apt-get update + fi +fi + # OpenJDK 11 for components that require it (e.g. Nutch 1.22, class file version 55.0) apt-get install -y openjdk-11-jdk From eac4490f42278bee052b77e5b3737137e946a561 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 12:05:24 -0800 Subject: [PATCH 17/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- provisioner/utils/setup-env-debian.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/provisioner/utils/setup-env-debian.sh b/provisioner/utils/setup-env-debian.sh index 734d6fbdc6..a2c8c1ba48 100755 --- a/provisioner/utils/setup-env-debian.sh +++ b/provisioner/utils/setup-env-debian.sh @@ -28,10 +28,11 @@ service rng-tools start apt-get install -y locales # Enable universe on Ubuntu so openjdk-11-jdk is available (e.g. for Nutch 1.22) +# Use same Signed-By as main Ubuntu sources to avoid "Conflicting values set for option Signed-By" if command -v lsb_release >/dev/null 2>&1 && [ "$(lsb_release -is 2>/dev/null)" = "Ubuntu" ]; then release=$(lsb_release -sc 2>/dev/null) if [ -n "$release" ] && [ ! -f /etc/apt/sources.list.d/universe.list ]; then - echo "deb http://archive.ubuntu.com/ubuntu $release universe" > /etc/apt/sources.list.d/universe.list + echo "deb [signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://archive.ubuntu.com/ubuntu $release universe" > /etc/apt/sources.list.d/universe.list apt-get update fi fi From 4c53f55d3819716bf37d471c5d8e3e268a40ca97 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 12:19:28 -0800 Subject: [PATCH 18/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy index 85910b7a91..fc7a2f0653 100644 --- a/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy +++ b/bigtop-tests/smoke-tests/nutch/TestNutchSmoke.groovy @@ -50,10 +50,12 @@ class TestNutchSmoke { sh.exec("hadoop fs -rm -r -f ${HDFS_BASE} 2>/dev/null || true") } - @Test + @Test(timeout = 15000L) void testNutchUsage() { - sh.exec(NUTCH_CMD) - assertTrue("nutch usage failed: " + sh.getErr(), sh.getRet() == 0) + sh.exec("${NUTCH_CMD} showproperties") + assertTrue("nutch showproperties failed: " + sh.getErr(), sh.getRet() == 0) + String out = sh.getOut().toString() + " " + sh.getErr().toString() + assertTrue("nutch showproperties should print config (got: " + out + ")", out.contains("=") || out.contains("nutch")) } @Test From fefe706651d642af118396b8b8b8d30c183b318c Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 12:33:48 -0800 Subject: [PATCH 19/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-deploy/puppet/modules/hadoop/manifests/init.pp | 6 ++++-- bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp index 164235dcec..594838c56e 100644 --- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp +++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp @@ -164,6 +164,8 @@ $hadoop_security_authentication = $hadoop::hadoop_security_authentication, $kerberos_realm = $hadoop::kerberos_realm, $yarn_nodemanager_vmem_check_enabled = undef, + # Ensure MR Application Master has log4j config (avoids exit code 1 in YARN containers) + $yarn_app_mapreduce_am_command_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", ) inherits hadoop { include hadoop::common @@ -429,8 +431,8 @@ $mapreduce_job_reduce_slowstart_completedmaps = undef, $mapreduce_map_memory_mb = undef, $mapreduce_reduce_memory_mb = undef, - $mapreduce_map_java_opts = "-Xmx1024m", - $mapreduce_reduce_java_opts = "-Xmx1024m", + $mapreduce_map_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", + $mapreduce_reduce_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties", $hadoop_security_authentication = $hadoop::hadoop_security_authentication, $kerberos_realm = $hadoop::kerberos_realm, ) inherits hadoop { diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml index d14eb3bb2c..d5de4af74a 100644 --- a/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml +++ b/bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml @@ -211,6 +211,12 @@ $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/* + + + JVM options for the MapReduce Application Master (includes log4j config for container logs). + yarn.app.mapreduce.am.command-opts + <%= @yarn_app_mapreduce_am_command_opts %> + <% if @yarn_scheduler_minimum_allocation_mb -%> From 2b5ea60f87d53b91c4aa9c2cfa3c92381cddf7de Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 12:43:27 -0800 Subject: [PATCH 20/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- .../src/common/hadoop/conf.secure/log4j.properties | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties index f75a299775..ba87f915dc 100644 --- a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties +++ b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties @@ -17,6 +17,8 @@ hadoop.root.logger=INFO,console hadoop.log.dir=. hadoop.log.file=hadoop.log +# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) +yarn.app.container.log.dir=/tmp # Define the root logger to the system property "hadoop.root.logger". log4j.rootLogger=${hadoop.root.logger}, EventCounter @@ -41,6 +43,18 @@ log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n # Debugging Pattern format #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n +# +# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). +# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. +# Must be defined so rootLogger can use CLA when this file is used for container log4j. +# +log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender +log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} +log4j.appender.CLA.containerLogFile=syslog +log4j.appender.CLA.totalLogFileSize=10485760 +log4j.appender.CLA.layout=org.apache.log4j.PatternLayout +log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n + # # console # Add "console" to rootlogger above if you want to use this From 5c8aa85672aae79d2debf9c9fdf27d7fd99baa30 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 13:04:47 -0800 Subject: [PATCH 21/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-packages/src/common/hadoop/install_hadoop.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigtop-packages/src/common/hadoop/install_hadoop.sh b/bigtop-packages/src/common/hadoop/install_hadoop.sh index 966890f007..f2d4da920c 100755 --- a/bigtop-packages/src/common/hadoop/install_hadoop.sh +++ b/bigtop-packages/src/common/hadoop/install_hadoop.sh @@ -391,6 +391,8 @@ cp ${DISTRO_DIR}/conf.empty/mapred-site.xml $PREFIX/$ETC_HADOOP/conf.empty sed -i -e '/^[^#]/s,^,#,' ${BUILD_DIR}/etc/hadoop/hadoop-env.sh cp -r ${BUILD_DIR}/etc/hadoop/* $PREFIX/$ETC_HADOOP/conf.empty rm -rf $PREFIX/$ETC_HADOOP/conf.empty/*.cmd +# Overwrite with Bigtop log4j.properties (includes CLA for YARN containers, e.g. MR AM) +cp $(dirname $0)/../conf.secure/log4j.properties $PREFIX/$ETC_HADOOP/conf.empty/log4j.properties # Install default wrapper install -d -m 0755 $PREFIX/$ETC_DEFAULT From 715b6bc8f15f26813c93f4bb6845e4876e8b4808 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 13:38:01 -0800 Subject: [PATCH 22/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- bigtop-deploy/puppet/modules/hadoop/manifests/init.pp | 6 ++++++ bigtop-packages/src/common/hadoop/install_hadoop.sh | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp index 594838c56e..ac62152f3f 100644 --- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp +++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp @@ -128,6 +128,12 @@ require => [Package["hadoop"]], } + # Deploy log4j.properties with CLA appender so YARN containers (e.g. MR AM) can use it + file { "/etc/hadoop/conf/log4j.properties": + source => 'puppet:///modules/hadoop/log4j.properties', + require => [Package["hadoop"]], + } + package { "hadoop": ensure => latest, require => Package["jdk"], diff --git a/bigtop-packages/src/common/hadoop/install_hadoop.sh b/bigtop-packages/src/common/hadoop/install_hadoop.sh index f2d4da920c..4ec5ca1a6d 100755 --- a/bigtop-packages/src/common/hadoop/install_hadoop.sh +++ b/bigtop-packages/src/common/hadoop/install_hadoop.sh @@ -392,7 +392,11 @@ sed -i -e '/^[^#]/s,^,#,' ${BUILD_DIR}/etc/hadoop/hadoop-env.sh cp -r ${BUILD_DIR}/etc/hadoop/* $PREFIX/$ETC_HADOOP/conf.empty rm -rf $PREFIX/$ETC_HADOOP/conf.empty/*.cmd # Overwrite with Bigtop log4j.properties (includes CLA for YARN containers, e.g. MR AM) -cp $(dirname $0)/../conf.secure/log4j.properties $PREFIX/$ETC_HADOOP/conf.empty/log4j.properties +# If not found (e.g. DEB build layout), Puppet deploys it at provision time. +BIGTOP_LOG4J="$(dirname $0)/../conf.secure/log4j.properties" +if [ -f "$BIGTOP_LOG4J" ]; then + cp "$BIGTOP_LOG4J" $PREFIX/$ETC_HADOOP/conf.empty/log4j.properties +fi # Install default wrapper install -d -m 0755 $PREFIX/$ETC_DEFAULT From 16f4e8d5fea0dc298c6f3c1a68b198b49c53ef2e Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 13:50:09 -0800 Subject: [PATCH 23/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- .../modules/hadoop/files/log4j.properties | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 bigtop-deploy/puppet/modules/hadoop/files/log4j.properties diff --git a/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties new file mode 100644 index 0000000000..ba87f915dc --- /dev/null +++ b/bigtop-deploy/puppet/modules/hadoop/files/log4j.properties @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Define some default values that can be overridden by system properties +hadoop.root.logger=INFO,console +hadoop.log.dir=. +hadoop.log.file=hadoop.log +# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) +yarn.app.container.log.dir=/tmp + +# Define the root logger to the system property "hadoop.root.logger". +log4j.rootLogger=${hadoop.root.logger}, EventCounter + +# Logging Threshold +log4j.threshhold=ALL + +# +# Daily Rolling File Appender +# + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). +# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. +# Must be defined so rootLogger can use CLA when this file is used for container log4j. +# +log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender +log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} +log4j.appender.CLA.containerLogFile=syslog +log4j.appender.CLA.totalLogFileSize=10485760 +log4j.appender.CLA.layout=org.apache.log4j.PatternLayout +log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# +# TaskLog Appender +# + +#Default values +hadoop.tasklog.taskid=null +hadoop.tasklog.noKeepSplits=4 +hadoop.tasklog.totalLogFileSize=100 +hadoop.tasklog.purgeLogSplits=true +hadoop.tasklog.logsRetainHours=12 +hadoop.tasklog.iscleanup=false + +log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender +log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} +log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} + +log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} + +log4j.appender.TLA.layout=org.apache.log4j.PatternLayout +log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics. +# +log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter + +#======= +# security audit logging + +security.audit.logger=INFO,console +log4j.category.SecurityLogger=${security.audit.logger} +log4j.additivity.SecurityLogger=false +log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log +log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd + +# hdfs audit logging + +hdfs.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} +log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false +log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log +log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd + + +# mapred audit logging + +mapred.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} +log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false +log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log +log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd + + +# Mapred job summary + +mapred.jobsummary.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger} +log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false +log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log +log4j.appender.JSA.layout=org.apache.log4j.PatternLayout +log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.JSA.DatePattern=.yyyy-MM-dd From 8ee3c51adf267d79c63f12d0c1a0c4203390a84e Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 14:18:33 -0800 Subject: [PATCH 24/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- .../puppet/modules/hadoop/manifests/init.pp | 2 +- .../hadoop/templates/log4j.properties.erb | 140 ++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp index ac62152f3f..e31bc2b29f 100644 --- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp +++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp @@ -130,7 +130,7 @@ # Deploy log4j.properties with CLA appender so YARN containers (e.g. MR AM) can use it file { "/etc/hadoop/conf/log4j.properties": - source => 'puppet:///modules/hadoop/log4j.properties', + content => template('hadoop/log4j.properties.erb'), require => [Package["hadoop"]], } diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb new file mode 100644 index 0000000000..e0c841d410 --- /dev/null +++ b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Define some default values that can be overridden by system properties +hadoop.root.logger=INFO,console +hadoop.log.dir=. +hadoop.log.file=hadoop.log +# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) +yarn.app.container.log.dir=/tmp + +# Root logger: only console (EventCounter/CLA not on YARN container classpath; causes AM ClassNotFoundException) +log4j.rootLogger=${hadoop.root.logger} + +# Logging Threshold +log4j.threshhold=ALL + +# +# Daily Rolling File Appender +# + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +# Debugging Pattern format +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). +# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. +# Must be defined so rootLogger can use CLA when this file is used for container log4j. +# +log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender +log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} +log4j.appender.CLA.containerLogFile=syslog +log4j.appender.CLA.totalLogFileSize=10485760 +log4j.appender.CLA.layout=org.apache.log4j.PatternLayout +log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# +# TaskLog Appender +# + +#Default values +hadoop.tasklog.taskid=null +hadoop.tasklog.noKeepSplits=4 +hadoop.tasklog.totalLogFileSize=100 +hadoop.tasklog.purgeLogSplits=true +hadoop.tasklog.logsRetainHours=12 +hadoop.tasklog.iscleanup=false + +log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender +log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} +log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} + +log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup} + +log4j.appender.TLA.layout=org.apache.log4j.PatternLayout +log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics. +# +log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter + +#======= +# security audit logging + +security.audit.logger=INFO,console +log4j.category.SecurityLogger=${security.audit.logger} +log4j.additivity.SecurityLogger=false +log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log +log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd + +# hdfs audit logging + +hdfs.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger} +log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false +log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log +log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd + + +# mapred audit logging + +mapred.audit.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger} +log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false +log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender +log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log +log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout +log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd + + +# Mapred job summary + +mapred.jobsummary.logger=INFO,console +log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger} +log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false +log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log +log4j.appender.JSA.layout=org.apache.log4j.PatternLayout +log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n +log4j.appender.JSA.DatePattern=.yyyy-MM-dd From 63083253e8c065aeb1a576c2aed7b4461cdc5d34 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 1 Mar 2026 14:28:49 -0800 Subject: [PATCH 25/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- .../modules/hadoop/templates/log4j.properties.erb | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb index e0c841d410..4a14263e09 100644 --- a/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb +++ b/bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb @@ -17,8 +17,6 @@ hadoop.root.logger=INFO,console hadoop.log.dir=. hadoop.log.file=hadoop.log -# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) -yarn.app.container.log.dir=/tmp # Root logger: only console (EventCounter/CLA not on YARN container classpath; causes AM ClassNotFoundException) log4j.rootLogger=${hadoop.root.logger} @@ -43,17 +41,7 @@ log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n # Debugging Pattern format #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). -# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. -# Must be defined so rootLogger can use CLA when this file is used for container log4j. -# -log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender -log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} -log4j.appender.CLA.containerLogFile=syslog -log4j.appender.CLA.totalLogFileSize=10485760 -log4j.appender.CLA.layout=org.apache.log4j.PatternLayout -log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n +# CLA (ContainerLogAppender) omitted: class not on MR AM container classpath, causes ClassNotFoundException # # console From 4b191a2bcde2e97f6847570330e808c78e4082f9 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 5 Mar 2026 17:55:14 -0800 Subject: [PATCH 26/26] BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem --- .../puppet/modules/hadoop/manifests/init.pp | 2 +- .../common/hadoop/conf.secure/log4j.properties | 17 +++-------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp index e31bc2b29f..bb704702f0 100644 --- a/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp +++ b/bigtop-deploy/puppet/modules/hadoop/manifests/init.pp @@ -128,7 +128,7 @@ require => [Package["hadoop"]], } - # Deploy log4j.properties with CLA appender so YARN containers (e.g. MR AM) can use it + # Deploy log4j.properties for YARN containers (no CLA appender; container-safe). file { "/etc/hadoop/conf/log4j.properties": content => template('hadoop/log4j.properties.erb'), require => [Package["hadoop"]], diff --git a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties index ba87f915dc..23ed3bfc87 100644 --- a/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties +++ b/bigtop-packages/src/common/hadoop/conf.secure/log4j.properties @@ -17,11 +17,10 @@ hadoop.root.logger=INFO,console hadoop.log.dir=. hadoop.log.file=hadoop.log -# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers) -yarn.app.container.log.dir=/tmp # Define the root logger to the system property "hadoop.root.logger". -log4j.rootLogger=${hadoop.root.logger}, EventCounter +# Container-safe: no CLA (ContainerLogAppender) or EventCounter in rootLogger. +log4j.rootLogger=${hadoop.root.logger} # Logging Threshold log4j.threshhold=ALL @@ -43,17 +42,7 @@ log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n # Debugging Pattern format #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n -# -# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master). -# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers. -# Must be defined so rootLogger can use CLA when this file is used for container log4j. -# -log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender -log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} -log4j.appender.CLA.containerLogFile=syslog -log4j.appender.CLA.totalLogFileSize=10485760 -log4j.appender.CLA.layout=org.apache.log4j.PatternLayout -log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n +# CLA (ContainerLogAppender) omitted: not on MR AM container classpath; causes errors in YARN containers. # # console