#!/bin/bash

#############################################################################################
#
# FILENAME:    hadoop-setup.sh
#
# USAGE:       "source hadoop-setup.sh"
#
# NOTE:        This script sets multiple environment variables, so you MUST use
#              "source" to run the script. Otherwise, the shell in which you ran
#              this script will not pick up the new environment variables!
#
# DESCRIPTION: This script sets up hadoop on the ARC cluster. It assumes one master node
#              and N slave nodes, where N is the number of nodes you reserved using "srun". 
#              The master node will be the lowest numbered node in your reservation. It
#              sets up the HDFS and also creates a "/user/UNITYID" directory inside the
#              HDFS, which is where any input/output will be read/written to. To make sure
#              this script ran successfully, run "hdfs dfs -ls /user". You should see the
#              "/user/UNITYID" directory that was created. 
# 
# AUTHOR:      Tyler Stocksdale, Frank Mueller
# DATE:        10/18/2017, 3/3/2024
#
#############################################################################################

# Remember the starting directory; the HDFS start-up step at the end of this
# script changes back to it.
ORIG_DIR=$(pwd)

#Should already be in .bashrc
#module load java
export JAVA_HOME=/usr/lib/jvm/java/

#Set up new hadoop directory in the current directory
# The private tree gets its own etc/hadoop (the generated configuration goes
# there) and its own bin, libexec and sbin directories, whose entries are
# symlinks to the files of the system-wide installation in /usr/local/hadoop.
# Any tree left over from an earlier run is discarded first.
rm -rf hadoop/
if ! mkdir -p hadoop/etc/hadoop hadoop/bin hadoop/libexec hadoop/sbin; then
  echo "hadoop-setup.sh: cannot create the hadoop directory tree in $ORIG_DIR" >&2
  # This file is sourced, so "return" (not "exit") ends the script without
  # closing the caller's shell.
  return 1
fi
ln -s /usr/local/hadoop/bin/* hadoop/bin/
ln -s /usr/local/hadoop/libexec/* hadoop/libexec/
ln -s /usr/local/hadoop/sbin/* hadoop/sbin/

# The rest of the script works relative to the new tree, so stop here if it
# cannot be entered instead of creating files somewhere else.
if ! cd hadoop; then
  echo "hadoop-setup.sh: cannot change into $ORIG_DIR/hadoop" >&2
  return 1
fi

# Link every remaining top-level entry of the installation. The entries that
# share a name with a directory created above cannot be linked; those
# expected errors are why all output of this command is discarded.
ln -s /usr/local/hadoop/* . > /dev/null 2>&1

#Set environment variables (These will not persist unless using "source" command!)
# The current directory is taken as the Hadoop home, and every component
# *_HOME variable points at that same tree.
HADOOP_HOME=$(pwd)
export HADOOP_HOME
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

# Append sbin and bin only when they are not already present, so sourcing the
# script again from the same directory does not keep lengthening PATH.
case ":$PATH:" in
  *":$HADOOP_HOME/sbin:$HADOOP_HOME/bin:"*) ;;
  *) export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin ;;
esac

# CLASSPATH is whatever "hadoop classpath" prints. Assigning first and
# exporting afterwards keeps the command's exit status visible.
if ! CLASSPATH=$(hadoop classpath); then
  echo "hadoop-setup.sh: warning: 'hadoop classpath' failed; CLASSPATH may be empty or incomplete" >&2
fi
export CLASSPATH

export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
# The logging configuration generated further down is named log4j.properties,
# so that is the file name referenced here.
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native -Dlog4j.configuration=file:$HADOOP_CONF_DIR/log4j.properties"

#Below files need to be put in etc/hadoop directory
cd etc/hadoop

#Create file workers
echo $SLURM_NODELIST | 
	tr -d c | 
	tr -d [ | 
	tr -d ] | 
	perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg' | 
	awk 'BEGIN { RS=","} { print "c"$1 }' > workers
	
#Create file masters
head -1 workers > masters

MASTER=$(cat masters)

#Create file core-site.xml
# Points the default file system at port 9000 on the master node. The
# here-document substitutes $MASTER as-is, with no word splitting or globbing
# applied to its value.
cat > core-site.xml <<EOF
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://${MASTER}:9000</value>
    </property>
</configuration>
EOF

#Create file hdfs-site.xml
# Replication factor 1, with the datanode and namenode directories placed
# under /tmp/$USER/hadoop. The here-document substitutes $USER as-is, with no
# word splitting or globbing applied to its value.
cat > hdfs-site.xml <<EOF
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///tmp/${USER}/hadoop/data</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///tmp/${USER}/hadoop/name</value>
    </property>
</configuration>
EOF

#Create mapred-site.xml
# Selects yarn as the MapReduce framework and passes the current CLASSPATH on
# as the application classpath. The here-document substitutes $CLASSPATH
# as-is, so wildcard entries in it are never subject to globbing.
cat > mapred-site.xml <<EOF
<configuration>
   <property>
      <name>mapreduce.framework.name</name>
      <value>yarn</value>
   </property>
   <property>
     <name>mapreduce.application.classpath</name>
     <value>${CLASSPATH}</value>
   </property>
</configuration>
EOF

#Create yarn-site.xml
# Enables the mapreduce_shuffle auxiliary service and names the master node as
# the resource manager host. The here-document substitutes $MASTER as-is, with
# no word splitting or globbing applied to its value.
cat > yarn-site.xml <<EOF
<configuration>
        <property>
                <name>yarn.nodemanager.aux-services</name>
                <value>mapreduce_shuffle</value>
        </property>
        <property>
                <name>yarn.resourcemanager.hostname</name>
                <value>${MASTER}</value>
        </property>
</configuration>
EOF

#Create hadoop-env.sh
# One printf argument per generated line. The values of JAVA_HOME, HADOOP_HOME
# and HADOOP_CONF_DIR are substituted now, so the generated file contains
# fixed paths. The last line deliberately ends in a blank after the closing
# quote, exactly as this file has always been written.
printf '%s\n' \
  "export JAVA_HOME=$JAVA_HOME" \
  "export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop" \
  "export HADOOP_OPTS=\"-Djava.library.path=$HADOOP_HOME/lib/native -Dlog4j.configuration=file:$HADOOP_CONF_DIR/log4j.properties\" " \
  > hadoop-env.sh

#Create log4j.properties
# Static logging configuration: the root logger writes ERROR and above to the
# rolling "file" appender; the console appender is defined but only attached
# to the two com.journaldev loggers. The delimiter is quoted, so the text
# below is written out literally with no shell expansion.
cat > log4j.properties <<'EOF'
log4j.rootLogger=ERROR, file
#Define root logger options
#log4j.rootLogger=DEBUG, file, console

#Define console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%-5p %c{1} - %m%n

#Define rolling file appender
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=logs/main.log
log4j.appender.file.Append=true
log4j.appender.file.ImmediateFlush=true
log4j.appender.file.MaxFileSize=10MB
log4j.appender.file.MaxBackupIndex=5
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d %d{Z} [%t] %-5p (%F:%L) - %m%n

#Define loggers
log4j.logger.com.journaldev.log4j=WARN, file, console
log4j.logger.com.journaldev.log4j.logic=DEBUG, file, console

#setting additivity
log4j.additivity.com.journaldev.log4j=false
log4j.additivity.com.journaldev.log4j.logic=false
EOF

#Create capacity-scheduler.xml
# Static capacity-scheduler settings: a single "default" queue under root that
# is given 100 percent capacity and is open to every user. The delimiter is
# quoted, so the text below is written out literally with no shell expansion.
cat > capacity-scheduler.xml <<'EOF'
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>

  <property>
    <name>yarn.scheduler.capacity.maximum-applications</name>
    <value>10000</value>
    <description>
      Maximum number of applications that can be pending and running.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
    <description>
      Maximum percent of resources in the cluster which can be used to run 
      application masters i.e. controls number of concurrent running
      applications.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.resource-calculator</name>
    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
    <description>
      The ResourceCalculator implementation to be used to compare 
      Resources in the scheduler.
      The default i.e. DefaultResourceCalculator only uses Memory while
      DominantResourceCalculator uses dominant-resource to compare 
      multi-dimensional resources such as Memory, CPU etc.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
    <description>
      The queues at the this level (root is the root queue).
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
    <description>Default queue target capacity.</description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
    <description>
      Default queue user limit a percentage from 0.0 to 1.0.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
    <description>
      The maximum capacity of the default queue. 
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
    <description>
      The state of the default queue. State can be one of RUNNING or STOPPED.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
    <description>
      The ACL of who can submit jobs to the default queue.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
    <description>
      The ACL of who can administer jobs on the default queue.
    </description>
  </property>

  <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>-1</value>
    <description>
      Number of missed scheduling opportunities after which the CapacityScheduler 
      attempts to schedule rack-local containers. 
      Typically this should be set to number of racks in the cluster, this 
      feature is disabled by default, set to -1.
    </description>
  </property>

</configuration>
EOF


#Remove tmp directory then create a new one for all nodes
# The workers file in the current directory is read one host per line. ssh -n
# keeps ssh away from stdin, so it cannot swallow the rest of that file. Each
# ssh call runs in the foreground, so every node has been handled once the
# loop ends and no extra wait is needed afterwards.
while IFS= read -r curNode || [[ -n "$curNode" ]]; do
  [[ -n "$curNode" ]] || continue
  ssh -n "$curNode" "rm -rf /tmp/$USER/hadoop; mkdir -p /tmp/$USER/hadoop" ||
    echo "hadoop-setup.sh: warning: could not reset /tmp/$USER/hadoop on $curNode" >&2
done < workers


#Set up the HDFS
if ! cd "$ORIG_DIR"; then
  echo "hadoop-setup.sh: cannot change back to $ORIG_DIR" >&2
  # This file is sourced, so "return" (not "exit") ends the script without
  # closing the caller's shell.
  return 1
fi
hdfs getconf -namenodes
# Starting the daemons on top of a namenode that failed to format cannot
# work, so stop with a clear message instead.
if ! hdfs namenode -format; then
  echo "hadoop-setup.sh: formatting the namenode failed; HDFS and YARN were not started" >&2
  return 1
fi
start-dfs.sh
start-yarn.sh
# -p creates /user and /user/$USER in a single call.
hdfs dfs -mkdir -p "/user/$USER"

