#!/bin/bash ############################################################################################# # # FILENAME: hadoop-setup.sh # # USAGE: "source hadoop-setup.sh" # # NOTE: This script sets multiple environment variables, so you MUST use # "source" to run the script. Otherwise, the shell in which you ran # this script will not pick up the new environment variables! # # DESCRIPTION: This script sets up hadoop on the ARC cluster. It assumes one master node # and N slave nodes, where N is the number of nodes you reserved using "srun". # The master node will be the lowest numbered node in your reservation. It # sets up the HDFS and also creates a "/user/UNITYID" directory inside the # HDFS, which is where any input/output will be read/written to. To make sure # this script ran successfully, run "hdfs dfs -ls /user". You should see the # "/user/UNITYID" directory that was created. # # AUTHOR: Tyler Stocksdale, Frank Mueller # DATE: 10/18/2017, 3/3/2024 # ############################################################################################# ORIG_DIR=`pwd` #Should already be in .bashrc #module load java export JAVA_HOME=/usr/lib/jvm/java/ #Set up new hadoop directory in the current directory rm -rf hadoop/ mkdir hadoop/ cd hadoop mkdir -p etc/hadoop mkdir bin cd bin ln -s /usr/local/hadoop/bin/* . cd .. mkdir libexec cd libexec ln -s /usr/local/hadoop/libexec/* . cd .. mkdir sbin cd sbin ln -s /usr/local/hadoop/sbin/* . cd .. ln -s /usr/local/hadoop/* . >& /dev/null #Set environment variables (These will not persist unless using "source" command!) export HADOOP_HOME=`pwd` export HADOOP_INSTALL=$HADOOP_HOME export HADOOP_MAPRED_HOME=$HADOOP_HOME export HADOOP_COMMON_HOME=$HADOOP_HOME export HADOOP_HDFS_HOME=$HADOOP_HOME export YARN_HOME=$HADOOP_HOME export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin export CLASSPATH=`hadoop classpath` export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native -Dlog4j.configuration=file:$HADOOP_CONF_DIR/log.properties" #Below files need to be put in etc/hadoop directory cd etc/hadoop #Create file workers echo $SLURM_NODELIST | tr -d c | tr -d [ | tr -d ] | perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg' | awk 'BEGIN { RS=","} { print "c"$1 }' > workers #Create file masters head -1 workers > masters MASTER=$(cat masters) #Create file core-site.xml echo " fs.defaultFS hdfs://"$MASTER":9000 " > core-site.xml #Create file hdfs-site.xml echo " dfs.replication 1 dfs.datanode.data.dir file:///tmp/"$USER"/hadoop/data dfs.namenode.name.dir file:///tmp/"$USER"/hadoop/name " > hdfs-site.xml #Create mapred-site.xml echo " mapreduce.framework.name yarn mapreduce.application.classpath "$CLASSPATH" " > mapred-site.xml #Create yarn-site.xml echo " yarn.nodemanager.aux-services mapreduce_shuffle yarn.resourcemanager.hostname "$MASTER" " > yarn-site.xml #Create hadoop-env.sh echo "export JAVA_HOME=$JAVA_HOME export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop export HADOOP_OPTS=\"-Djava.library.path=$HADOOP_HOME/lib/native -Dlog4j.configuration=file:$HADOOP_CONF_DIR/log4j.properties\" " > hadoop-env.sh echo "log4j.rootLogger=ERROR, file #Define root logger options #log4j.rootLogger=DEBUG, file, console #Define console appender log4j.appender.console=org.apache.log4j.ConsoleAppender logrj.appender.console.Target=System.out log4j.appender.console.layout=org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern=%-5p %c{1} - %m%n #Define rolling file appender log4j.appender.file=org.apache.log4j.RollingFileAppender log4j.appender.file.File=logs/main.log log4j.appender.file.Append=true log4j.appender.file.ImmediateFlush=true log4j.appender.file.MaxFileSize=10MB log4j.appender.file.MaxBackupIndex=5 log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d %d{Z} [%t] %-5p (%F:%L) - %m%n #Define loggers log4j.logger.com.journaldev.log4j=WARN, file, console log4j.logger.com.journaldev.log4j.logic=DEBUG, file, console #setting additivity log4j.additivity.com.journaldev.log4j=false log4j.additivity.com.journaldev.log4j.logic=false" > log4j.properties echo " yarn.scheduler.capacity.maximum-applications 10000 Maximum number of applications that can be pending and running. yarn.scheduler.capacity.maximum-am-resource-percent 0.1 Maximum percent of resources in the cluster which can be used to run application masters i.e. controls number of concurrent running applications. yarn.scheduler.capacity.resource-calculator org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator The ResourceCalculator implementation to be used to compare Resources in the scheduler. The default i.e. DefaultResourceCalculator only uses Memory while DominantResourceCalculator uses dominant-resource to compare multi-dimensional resources such as Memory, CPU etc. yarn.scheduler.capacity.root.queues default The queues at the this level (root is the root queue). yarn.scheduler.capacity.root.default.capacity 100 Default queue target capacity. yarn.scheduler.capacity.root.default.user-limit-factor 1 Default queue user limit a percentage from 0.0 to 1.0. yarn.scheduler.capacity.root.default.maximum-capacity 100 The maximum capacity of the default queue. yarn.scheduler.capacity.root.default.state RUNNING The state of the default queue. State can be one of RUNNING or STOPPED. yarn.scheduler.capacity.root.default.acl_submit_applications * The ACL of who can submit jobs to the default queue. yarn.scheduler.capacity.root.default.acl_administer_queue * The ACL of who can administer jobs on the default queue. yarn.scheduler.capacity.node-locality-delay -1 Number of missed scheduling opportunities after which the CapacityScheduler attempts to schedule rack-local containers. Typically this should be set to number of racks in the cluster, this feature is disabled by default, set to -1. " > capacity-scheduler.xml #Remove tmp directory then create a new one for all nodes for curNode in `cat workers`; do ssh -n $curNode "rm -rf /tmp/$USER/hadoop; mkdir -p /tmp/$USER/hadoop" done #Make sure all ssh's finish before moving on sleep 1 #Set up the HDFS cd $ORIG_DIR hdfs getconf -namenodes hdfs namenode -format start-dfs.sh start-yarn.sh hdfs dfs -mkdir /user hdfs dfs -mkdir /user/$USER