http://blog.csdn.net/carl810224/article/details/52160418
http://blog.leanote.com/post/[email protected]/Hadoop%E4%BC%AA%E5%88%86%E5%B8%83%E5%BC%8F%E9%9B%86%E7%BE%A4%E6%90%AD%E5%BB%BA%EF%BC%88%E6%B5%8B%E8%AF%95%EF%BC%89
Hadoop0 JDK/Zookeeper/Hadoop namenode/zkfc/journalnode/resourcemanager/QuorumPeerMain
Hadoop1 JDK/Zookeeper/Hadoop namenode/zkfc/journalnode/resourcemanager/QuorumPeerMain
Hadoop2 JDK/Zookeeper/Hadoop datanode/journalnode/nodemanager/QuorumPeerMain
wget http://mirrors.aliyun.com/apache/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz
cd zookeeper-3.4.6/
cp conf/zoo_sample.cfg conf/zoo.cfg
vim conf/zoo.cfg
tickTime=2000 //心跳基本时间单位(毫秒),其它时间配置均以它的倍数表示
initLimit=10 //允许follower初始连接并同步到leader的最大心跳间隔数(tickTime的倍数)
syncLimit=5 //同步时限
dataDir=/usr/local/zookeeper-3.4.6/data //数据存储目录
dataLogDir=/usr/local/zookeeper-3.4.6/data/log //数据日志存储目录
clientPort=2181 //端口号
maxClientCnxns=2000 //单个客户端IP与zookeeper的最大并发连接数
server.1=hadoop0:2888:3888 //设置zookeeper的节点
server.2=hadoop1:2888:3888
server.3=hadoop2:2888:3888
mkdir -p data/log
echo '1' > data/myid //其它两个节点分别修改内容为2和3,以此类推
启动命令 ./bin/zkServer.sh start
修改hadoop配置文件
cd hadoop-2.7.0/etc/hadoop/
vim hadoop-env.sh
按如下内容进行配置(具体配置情况需按照生产环境和条件进行配置):
export JAVA_HOME=/home/hadoop/apache/jdk1.8.0_101 //设置jdk路径
export HADOOP_SSH_OPTS="-p 27005" //设置ssh端口
export HADOOP_HEAPSIZE=1024 //设置Hadoop守护进程的堆内存大小(单位:MB)
export HADOOP_NAMENODE_OPTS="-Xmx1024m
-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS}
-Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
//设置Namenode内存大小,此处根据实际情况设定其大小
export HADOOP_DATANODE_OPTS="-Xmx1024m
-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" //设置Datanode内存大小,注意引号需闭合
export HADOOP_PORTMAP_OPTS="-Xmx1024m $HADOOP_PORTMAP_OPTS" //修改至1024m
指定slaves
vi etc/hadoop/slaves
hadoop1
hadoop2
配置core-site.xml:
vim core-site.xml
<configuration>
<!-- 指定hdfs的nameservices名称为mycluster,与hdfs-site.xml的HA配
置相同 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- 设置zookeeper集群的配置和端口 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>hadoop0:2181,hadoop1:2181,hadoop2:2181</value>
</property>
<!-- 指定缓存文件存储的路径和读写缓冲区大小(可以设置的大一些,单位:字节) -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop-2.7.0/tmp</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>131072</value>
</property>
<!-- 配置hdfs文件被永久删除前保留的时间(单位:分钟),默认值为0,表明垃圾回收站功能关闭 -->
<property>
<name>fs.trash.interval</name>
<value>10080</value>
</property>
</configuration>
配置hdfs-site.xml:
vim hdfs-site.xml
<configuration>
<!-- 数据备份的个数 -->
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<!-- 关闭权限验证 -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- 开启WebHDFS功能(基于REST的接口服务) -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- //////////////以下为HDFS HA的配置////////////// -->
<!-- 指定hdfs的nameservices名称为mycluster -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- 指定mycluster的两个namenode的名称分别为nn1,nn2 -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!-- 配置nn1,nn2的rpc通信端口 -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>hadoop0:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>hadoop1:8020</value>
</property>
<!-- 配置nn1,nn2的http通信端口 -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>hadoop0:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>hadoop1:50070</value>
</property>
<!-- 指定namenode元数据存储在journalnode中的路径,至少3个journalnode-->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoop0:8485;hadoop1:8485;hadoop2:8485/mycluster</value>
</property>
<!-- 指定HDFS客户端连接active namenode的java类 -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- 配置隔离机制为ssh -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence(spark:27005)</value> <!-- 格式为sshfence(用户名:ssh端口) -->
</property>
<!-- 指定秘钥的位置 -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_dsa</value>
</property>
<!-- 指定journalnode日志文件存储的路径 -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/usr/local/hadoop-2.7.0/tmp/journal</value>
</property>
<!-- 开启自动故障转移 -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
配置mapred-site.xml:
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<configuration>
<!-- 指定MapReduce计算框架使用YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 指定jobhistory server的rpc地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop0:10020</value>
</property>
<!-- 指定jobhistory server的http地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop0:19888</value>
</property>
<!-- 开启uber模式(针对小作业的优化) -->
<property>
<name>mapreduce.job.ubertask.enable</name>
<value>true</value>
</property>
<!-- 配置启动uber模式的最大map数 -->
<property>
<name>mapreduce.job.ubertask.maxmaps</name>
<value>3</value>
</property>
<!-- 配置启动uber模式的最大reduce数 -->
<property>
<name>mapreduce.job.ubertask.maxreduces</name>
<value>1</value>
</property>
</configuration>
配置yarn-site.xml文件:
vim yarn-site.xml
<configuration>
<!-- NodeManager上运行的附属服务,需配置成mapreduce_shuffle才可运行MapReduce程序 -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 配置Web Application Proxy安全代理(防止yarn被攻击) -->
<property>
<name>yarn.web-proxy.address</name>
<value>hadoop1:8888</value>
</property>
<!-- 开启日志 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 配置nodemanager可用的资源内存 -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<!-- 配置nodemanager可用的资源CPU -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>
<!-- //////////////以下为YARN HA的配置////////////// -->
<!-- 开启YARN HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- 启用自动故障转移 -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- 指定YARN HA的名称 -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yarncluster</value>
</property>
<!-- 指定两个resourcemanager的名称 -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- 配置rm1,rm2的主机 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>hadoop0</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>hadoop1</value>
</property>
<!-- 配置YARN的http端口 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>hadoop0:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>hadoop1:8088</value>
</property>
<!-- 配置zookeeper的地址 -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>hadoop0:2181,hadoop1:2181,hadoop2:2181</value>
</property>
<!-- 配置zookeeper的存储位置 -->
<property>
<name>yarn.resourcemanager.zk-state-store.parent-path</name>
<value>/rmstore</value>
</property>
<!-- 开启yarn resourcemanager restart -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 配置resourcemanager的状态存储到zookeeper中 -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- 开启yarn nodemanager restart -->
<property>
<name>yarn.nodemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 配置nodemanager IPC的通信端口 -->
<property>
<name>yarn.nodemanager.address</name>
<value>0.0.0.0:45454</value>
</property>
</configuration>
Hadoop集群初始化
zookeeper
echo 1 > /usr/local/zookeeper-3.4.6/data/myid
echo 2 > /usr/local/zookeeper-3.4.6/data/myid
echo 3 > /usr/local/zookeeper-3.4.6/data/myid
在所有节点上启动zookeeper集群:cd /usr/local/zookeeper-3.4.6/ && ./bin/zkServer.sh start
查看zookeeper状态: ./bin/zkServer.sh status
在hadoop0上格式化zkfc: ./bin/hdfs zkfc -formatZK
在所有节点上启动journalnode:./sbin/hadoop-daemon.sh start journalnode
在Hadoop0 格式化hdfs ./bin/hdfs namenode -format
将格式化后hadoop0节点namenode元数据目录复制到hadoop1节点
scp -r tmp/dfs hadoop1:/usr/local/hadoop-2.7.0/tmp/
启动hadoop集群
先对每个节点ssh一遍,接受host key
在hadoop0启动dfs:./sbin/start-dfs.sh
会开启以下进程:
namenode (hadoop0,hadoop1)
journalnode (hadoop0,hadoop1,hadoop2)
DFSZKFailoverController (hadoop0,hadoop1)
datanode (hadoop1,hadoop2) slaves文件指定的节点
在hadoop1启动YARN ./sbin/start-yarn.sh
执行后在hadoop1启动ResourceManager,并在slaves节点(hadoop1,hadoop2)启动NodeManager
在hadoop0启动容灾的ResourceManager ./sbin/yarn-daemon.sh start resourcemanager
在hadoop1启动YARN的安全代理
yarn-daemon.sh start proxyserver
注:proxyserver充当防火墙的角色,提高访问集群的安全性
在hadoop0启动YARN的历史任务 ./sbin/mr-jobhistory-daemon.sh start historyserver
查看namenode状态 http://hadoop0:50070 http://hadoop1:50070