Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
09f386e
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Feb 28, 2026
5f0ce1e
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
e1f527f
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
af63921
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
2200a29
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
27dd2c7
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
1bed6dd
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
b5d92a5
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
ef2d739
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
6800877
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
202a50c
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
a445ab5
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
796e170
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
ac1cbdc
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
22ec463
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
57b2b41
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
eac4490
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
4c53f55
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
fefe706
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
2b5ea60
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
5c8aa85
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
715b6bc
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
16f4e8d
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
8ee3c51
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
6308325
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 1, 2026
4b191a2
BIGTOP-284 Integrate Apache Nutch into the Apache Bigtop ecosystem
lewismc Mar 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions bigtop-ci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,12 @@ fi

# Start up build container
CONTAINER_ID=`docker run -d $DOCKER_RUN_OPTION $NEXUS $IMAGE_NAME /sbin/init`
trap "docker rm -f $CONTAINER_ID" EXIT
trap '[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID' EXIT

if [ -z "$CONTAINER_ID" ]; then
echo "Failed to start Docker container (e.g. permission denied). Ensure the user is in the docker group."
exit 1
fi

# Copy bigtop repo into container
docker cp $BIGTOP_HOME $CONTAINER_ID:/bigtop
Expand All @@ -124,7 +129,7 @@ RESULT=$?
mkdir -p output
docker cp $CONTAINER_ID:/bigtop/build .
docker cp $CONTAINER_ID:/bigtop/output .
docker rm -f $CONTAINER_ID
[ -n "$CONTAINER_ID" ] && docker rm -f $CONTAINER_ID

if [ $RESULT -ne 0 ]; then
exit 1
Expand Down
4 changes: 4 additions & 0 deletions bigtop-deploy/puppet/manifests/cluster.pp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@
solr => {
worker => ["solr-server"],
},
nutch => {
client => ["nutch-client"],
},
spark => {
worker => ["spark-on-yarn"],
client => ["spark-client"],
Expand Down Expand Up @@ -171,6 +174,7 @@
"hadoop_zookeeper",
"hcatalog",
"livy",
"nutch",
"solr",
"spark",
"tez",
Expand Down
140 changes: 140 additions & 0 deletions bigtop-deploy/puppet/modules/hadoop/files/log4j.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Define some default values that can be overridden by system properties
hadoop.root.logger=INFO,console
hadoop.log.dir=.
hadoop.log.file=hadoop.log
# Default for YARN container log dir (NodeManager sets yarn.app.container.log.dir when launching containers)
yarn.app.container.log.dir=/tmp

# Set the root logger from the "hadoop.root.logger" property and always attach EventCounter.
log4j.rootLogger=${hadoop.root.logger}, EventCounter

# Logging Threshold
# The key log4j 1.x reads is "log4j.threshold"; the former "threshhold" spelling was silently ignored.
log4j.threshold=ALL

#
# Daily Rolling File Appender
#

log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
log4j.appender.DRFA.DatePattern=.yyyy-MM-dd

# 30-day backup
#log4j.appender.DRFA.MaxBackupIndex=30
log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n

#
# ContainerLogAppender (CLA) - used by YARN containers (e.g. MR Application Master).
# YARN NodeManager sets yarn.app.container.log.dir etc. when launching containers.
# Must be defined so rootLogger can use CLA when this file is used for container log4j.
#
log4j.appender.CLA=org.apache.hadoop.yarn.util.ContainerLogAppender
log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir}
log4j.appender.CLA.containerLogFile=syslog
log4j.appender.CLA.totalLogFileSize=10485760
log4j.appender.CLA.layout=org.apache.log4j.PatternLayout
log4j.appender.CLA.layout.ConversionPattern=%d{ISO8601} %p [%t] %c: %m%n

#
# console
# Active by default: hadoop.root.logger above is set to INFO,console
#

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

#
# TaskLog Appender
#

#Default values
hadoop.tasklog.taskid=null
hadoop.tasklog.noKeepSplits=4
hadoop.tasklog.totalLogFileSize=100
hadoop.tasklog.purgeLogSplits=true
hadoop.tasklog.logsRetainHours=12
hadoop.tasklog.iscleanup=false

log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}

log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}

log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
# Uses the org.apache.hadoop.log.metrics implementation; the older
# org.apache.hadoop.metrics.jvm.EventCounter belonged to metrics v1.
# NOTE(review): confirm this class exists in the Hadoop version shipped by the stack.
#
log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter

#=======
# security audit logging

security.audit.logger=INFO,console
log4j.category.SecurityLogger=${security.audit.logger}
log4j.additivity.SecurityLogger=false
log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log
log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd

# hdfs audit logging

hdfs.audit.logger=INFO,console
log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log
log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd


# mapred audit logging

mapred.audit.logger=INFO,console
log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender
log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log
log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd


# Mapred job summary

mapred.jobsummary.logger=INFO,console
log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger}
log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false
log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log
log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.JSA.DatePattern=.yyyy-MM-dd
12 changes: 10 additions & 2 deletions bigtop-deploy/puppet/modules/hadoop/manifests/init.pp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@
require => [Package["hadoop"]],
}

# Deploy log4j.properties for YARN containers (no CLA appender; container-safe).
file { "/etc/hadoop/conf/log4j.properties":
content => template('hadoop/log4j.properties.erb'),
require => [Package["hadoop"]],
}

package { "hadoop":
ensure => latest,
require => Package["jdk"],
Expand Down Expand Up @@ -164,6 +170,8 @@
$hadoop_security_authentication = $hadoop::hadoop_security_authentication,
$kerberos_realm = $hadoop::kerberos_realm,
$yarn_nodemanager_vmem_check_enabled = undef,
# Ensure MR Application Master has log4j config (avoids exit code 1 in YARN containers)
$yarn_app_mapreduce_am_command_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
) inherits hadoop {

include hadoop::common
Expand Down Expand Up @@ -429,8 +437,8 @@
$mapreduce_job_reduce_slowstart_completedmaps = undef,
$mapreduce_map_memory_mb = undef,
$mapreduce_reduce_memory_mb = undef,
$mapreduce_map_java_opts = "-Xmx1024m",
$mapreduce_reduce_java_opts = "-Xmx1024m",
$mapreduce_map_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
$mapreduce_reduce_java_opts = "-Xmx1024m -Dlog4j.configuration=file:///etc/hadoop/conf/log4j.properties",
$hadoop_security_authentication = $hadoop::hadoop_security_authentication,
$kerberos_realm = $hadoop::kerberos_realm,
) inherits hadoop {
Expand Down
128 changes: 128 additions & 0 deletions bigtop-deploy/puppet/modules/hadoop/templates/log4j.properties.erb
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Define some default values that can be overridden by system properties
hadoop.root.logger=INFO,console
hadoop.log.dir=.
hadoop.log.file=hadoop.log

# Root logger: only console (EventCounter/CLA not on YARN container classpath; causes AM ClassNotFoundException)
log4j.rootLogger=${hadoop.root.logger}

# Logging Threshold
# The key log4j 1.x reads is "log4j.threshold"; the former "threshhold" spelling was silently ignored.
log4j.threshold=ALL

#
# Daily Rolling File Appender
#

log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
log4j.appender.DRFA.DatePattern=.yyyy-MM-dd

# 30-day backup
#log4j.appender.DRFA.MaxBackupIndex=30
log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n

# CLA (ContainerLogAppender) omitted: class not on MR AM container classpath, causes ClassNotFoundException

#
# console
# Active by default: hadoop.root.logger above is set to INFO,console
#

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

#
# TaskLog Appender
#

#Default values
hadoop.tasklog.taskid=null
hadoop.tasklog.noKeepSplits=4
hadoop.tasklog.totalLogFileSize=100
hadoop.tasklog.purgeLogSplits=true
hadoop.tasklog.logsRetainHours=12
hadoop.tasklog.iscleanup=false

log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}

log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}

log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
# Defined here but not attached to any logger in this file.
# Uses the org.apache.hadoop.log.metrics implementation; the older
# org.apache.hadoop.metrics.jvm.EventCounter belonged to metrics v1.
# NOTE(review): confirm this class exists in the Hadoop version shipped by the stack.
#
log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter

#=======
# security audit logging

security.audit.logger=INFO,console
log4j.category.SecurityLogger=${security.audit.logger}
log4j.additivity.SecurityLogger=false
log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFAS.File=/var/local/hadoop/logs/${hadoop.id.str}/${hadoop.id.str}-auth.log
log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd

# hdfs audit logging

hdfs.audit.logger=INFO,console
log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
log4j.appender.DRFAAUDIT=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFAAUDIT.File=/var/local/hadoop/logs/hadoop-logs/hdfs-audit.log
log4j.appender.DRFAAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.DRFAAUDIT.DatePattern=.yyyy-MM-dd


# mapred audit logging

mapred.audit.logger=INFO,console
log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
log4j.appender.MRAUDIT=org.apache.log4j.DailyRollingFileAppender
log4j.appender.MRAUDIT.File=/var/local/hadoop/logs/hadoop-logs/mapred-audit.log
log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.MRAUDIT.DatePattern=.yyyy-MM-dd


# Mapred job summary

mapred.jobsummary.logger=INFO,console
log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${mapred.jobsummary.logger}
log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false
log4j.appender.JSA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.JSA.File=${hadoop.log.dir}/mapred-jobsummary.log
log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
log4j.appender.JSA.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.JSA.DatePattern=.yyyy-MM-dd
6 changes: 6 additions & 0 deletions bigtop-deploy/puppet/modules/hadoop/templates/yarn-site.xml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,12 @@
$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*
</value>
</property>

<% if @yarn_app_mapreduce_am_command_opts and !@yarn_app_mapreduce_am_command_opts.to_s.empty? -%>
  <!-- Emitted only when a value is configured, so an undef/empty override
       never renders an empty <value> for the AM JVM options. -->
  <property>
    <description>JVM options for the MapReduce Application Master (includes log4j config for container logs).</description>
    <name>yarn.app.mapreduce.am.command-opts</name>
    <value><%= @yarn_app_mapreduce_am_command_opts %></value>
  </property>
<% end -%>
<% if @yarn_scheduler_minimum_allocation_mb -%>

<property>
Expand Down
34 changes: 34 additions & 0 deletions bigtop-deploy/puppet/modules/nutch/manifests/init.pp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Puppet module for Apache Nutch: a role dispatcher plus the client role.
class nutch {

  # Role dispatcher.
  # $roles - list of role names assigned to this node; when it contains
  #          'nutch-client', the nutch::client class is included.
  class deploy ($roles) {
    if ('nutch-client' in $roles) {
      include nutch::client
    }
  }

  # Client role: installs the nutch package and manages its defaults file.
  class client {
    package { 'nutch':
      ensure => latest,
    }

    # Rendered from the module template; applied only after the package exists.
    file { '/etc/default/nutch':
      require => Package['nutch'],
      content => template('nutch/nutch.default'),
    }
  }
}
Loading