http://blog.csdn.net/carl810224/article/details/52160418
http://blog.leanote.com/post/[email protected]/Hadoop%E4%BC%AA%E5%88%86%E5%B8%83%E5%BC%8F%E9%9B%86%E7%BE%A4%E6%90%AD%E5%BB%BA%EF%BC%88%E6%B5%8B%E8%AF%95%EF%BC%89



Node roles:
hadoop0    JDK/Zookeeper/Hadoop    namenode/zkfc/journalnode/resourcemanager/QuorumPeerMain
hadoop1    JDK/Zookeeper/Hadoop    namenode/zkfc/journalnode/resourcemanager/datanode/nodemanager/QuorumPeerMain
hadoop2    JDK/Zookeeper/Hadoop    datanode/journalnode/nodemanager/QuorumPeerMain
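
All three machines must be able to resolve one another's hostnames. A minimal /etc/hosts sketch (the 192.168.1.x addresses are placeholders, not from the original setup; substitute your real IPs):

192.168.1.10    hadoop0
192.168.1.11    hadoop1
192.168.1.12    hadoop2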

wget http://mirrors.aliyun.com/apache/zookeeper/zookeeper-3.4.6/zookeeper-3.4.6.tar.gz
tar -zxf zookeeper-3.4.6.tar.gz -C /usr/local/    //unpack so the /usr/local/zookeeper-3.4.6 paths used below exist
cd /usr/local/zookeeper-3.4.6/
cp conf/zoo_sample.cfg conf/zoo.cfg
vim conf/zoo.cfg
tickTime=2000   //basic heartbeat interval (ms)
initLimit=10    //max ticks allowed for the initial follower-leader sync
syncLimit=5     //max ticks allowed between follower-leader heartbeats
dataDir=/usr/local/zookeeper-3.4.6/data   //data directory
dataLogDir=/usr/local/zookeeper-3.4.6/data/log   //transaction log directory
clientPort=2181     //client port
maxClientCnxns=2000    //max number of client connections
server.1=hadoop0:2888:3888     //zookeeper ensemble members
server.2=hadoop1:2888:3888
server.3=hadoop2:2888:3888

mkdir -p data/log
echo '1' > data/myid    //on the other two nodes write 2 and 3 respectively
Start command:  ./bin/zkServer.sh start
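
To verify each node actually joined the ensemble, the four-letter admin commands can be probed over the client port (a quick sketch; assumes nc is installed):

echo ruok | nc hadoop0 2181               # prints "imok" if the server is alive
echo stat | nc hadoop0 2181 | grep Mode   # reports leader or follower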

Edit the Hadoop configuration files:
cd hadoop-2.7.0/etc/hadoop/
vim hadoop-env.sh
Configure as follows (tune the values to your actual environment):
export JAVA_HOME=/home/hadoop/apache/jdk1.8.0_101    //JDK path
export HADOOP_SSH_OPTS="-p 27005"   //ssh port
export HADOOP_HEAPSIZE=1024      //heap size (MB) for the Hadoop daemons
export HADOOP_NAMENODE_OPTS="-Xmx1024m
-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS}
-Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
//namenode heap size; set it according to the machine's actual memory
export HADOOP_DATANODE_OPTS="-Xmx1024m
-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"   //datanode heap size
export HADOOP_PORTMAP_OPTS="-Xmx1024m $HADOOP_PORTMAP_OPTS"   //raised to 1024m
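
A quick sanity check once the daemons come up later, to confirm these options took effect (a sketch; jps ships with the JDK):

$JAVA_HOME/bin/java -version     # JAVA_HOME points at the intended JDK
jps -v | grep NameNode           # the NameNode process should show -Xmx1024m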

Specify the slaves:
vi etc/hadoop/slaves
hadoop1
hadoop2
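
All nodes need identical configuration files. A small loop to push the finished config directory out (a sketch; assumes Hadoop lives at /usr/local/hadoop-2.7.0 on every node, consistent with the paths used elsewhere, and passwordless ssh on port 27005 per HADOOP_SSH_OPTS above):

for h in hadoop1 hadoop2; do
  scp -P 27005 -r /usr/local/hadoop-2.7.0/etc/hadoop "$h":/usr/local/hadoop-2.7.0/etc/
done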

Configure core-site.xml:
vim core-site.xml
<configuration>
  <!-- HDFS nameservice name: mycluster, matching the HA configuration in hdfs-site.xml -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://mycluster</value>
  </property>
<!-- Zookeeper ensemble hosts and client ports -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>hadoop0:2181,hadoop1:2181,hadoop2:2181</value>
  </property>
  <!-- Path for Hadoop temp/cache files, and the read/write buffer size (bytes; can be set fairly large) -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop-2.7.0/tmp</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>131072</value>
  </property>
  <!-- Minutes a deleted HDFS file is kept before permanent removal; the default 0 disables the trash -->
  <property>
    <name>fs.trash.interval</name>
    <value>10080</value>
  </property>
</configuration>
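
With fs.trash.interval=10080 (7 days), a deleted file first lands in the user's trash instead of being destroyed. A quick check once HDFS is running (a sketch):

hdfs dfs -touchz /tmp/demo.txt                  # create an empty test file
hdfs dfs -rm /tmp/demo.txt                      # prints "Moved: ... to trash at: ..."
hdfs dfs -ls /user/$USER/.Trash/Current/tmp     # the file waits here until the interval expires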

Configure hdfs-site.xml:
vim hdfs-site.xml
<configuration>
<!-- Number of block replicas (1 is enough for this test cluster) -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
<!-- Disable permission checking -->
  <property>
    <name>dfs.permissions.enabled</name>
    <value>false</value>
  </property>
<!-- Enable WebHDFS (the REST interface) -->
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
  <!-- ////////////// HDFS HA configuration ////////////// -->
  <!-- HDFS nameservice name: mycluster -->
  <property>
    <name>dfs.nameservices</name>
    <value>mycluster</value>
  </property>
<!-- The two namenodes of mycluster are named nn1 and nn2 -->
  <property>
    <name>dfs.ha.namenodes.mycluster</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC addresses of nn1 and nn2 -->
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn1</name>
    <value>hadoop0:8020</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.mycluster.nn2</name>
    <value>hadoop1:8020</value>
  </property>
  <!-- HTTP addresses of nn1 and nn2 -->
  <property>
    <name>dfs.namenode.http-address.mycluster.nn1</name>
    <value>hadoop0:50070</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.mycluster.nn2</name>
    <value>hadoop1:50070</value>
  </property>
  <!-- Where namenode edits are shared via the journalnodes (at least 3 journalnodes) -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://hadoop0:8485;hadoop1:8485;hadoop2:8485/mycluster</value>
  </property>
  <!-- Java class HDFS clients use to find the active namenode -->
  <property>
    <name>dfs.client.failover.proxy.provider.mycluster</name>
  <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <!-- Fencing method: ssh; the value gives the username and ssh port -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence(spark:27005)</value>
  </property>
  <!-- Path to the ssh private key -->
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_dsa</value>
  </property>
  <!-- Where the journalnode stores its edit logs -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/usr/local/hadoop-2.7.0/tmp/journal</value>
  </property>
  <!-- Enable automatic failover -->
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
</configuration>
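
sshfence only works if each namenode host can reach the other over ssh with the configured user, port, and key. Worth verifying before trusting failover (a sketch, run from hadoop0; assumes the key and user above match your setup):

ssh -p 27005 -i /root/.ssh/id_dsa -o BatchMode=yes spark@hadoop1 hostname   # must succeed without a password prompt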

Configure mapred-site.xml:
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<configuration>
  <!-- Run MapReduce on YARN -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <!-- RPC address of the jobhistory server -->
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>hadoop0:10020</value>
  </property>
  <!-- HTTP address of the jobhistory server -->
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>hadoop0:19888</value>
  </property>
  <!-- Enable uber mode (optimization for small jobs: they run inside the ApplicationMaster's JVM) -->
  <property>
    <name>mapreduce.job.ubertask.enable</name>
    <value>true</value>
  </property>
  <!-- Max number of maps for a job to run uberized -->
  <property>
    <name>mapreduce.job.ubertask.maxmaps</name>
    <value>3</value>
  </property>
  <!-- Max number of reduces for a job to run uberized -->
  <property>
    <name>mapreduce.job.ubertask.maxreduces</name>
    <value>1</value>
  </property>
</configuration>
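
Under these limits, the stock pi example with 2 maps and 1 reduce should qualify for uber mode and run inside the ApplicationMaster's JVM (a sketch; the jar path assumes the standard 2.7.0 layout under /usr/local):

yarn jar /usr/local/hadoop-2.7.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.0.jar pi 2 10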

Configure yarn-site.xml:
vim yarn-site.xml
<configuration>
  <!-- Auxiliary service run on each NodeManager; must be mapreduce_shuffle for MapReduce jobs to run -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- Web Application Proxy (shields the YARN web UIs from attack) -->
  <property>
    <name>yarn.web-proxy.address</name>
    <value>hadoop1:8888</value>
  </property>
  <!-- Enable log aggregation -->
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <!-- Memory (MB) available to the nodemanager -->
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>4096</value>
  </property>
  <!-- CPU vcores available to the nodemanager -->
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>4</value>
  </property>
  <!-- ////////////// YARN HA configuration ////////////// -->
  <!-- Enable YARN HA -->
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <!-- Enable automatic failover -->
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <!-- Cluster id for YARN HA -->
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>yarncluster</value>
  </property>
  <!-- Ids of the two resourcemanagers -->
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
  </property>
  <!-- Hosts of rm1 and rm2 -->
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>hadoop0</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>hadoop1</value>
  </property>

  <!-- HTTP addresses of the resourcemanagers -->
  <property>
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>hadoop0:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>hadoop1:8088</value>
  </property>
  <!-- Zookeeper quorum address (all three nodes, consistent with ha.zookeeper.quorum) -->
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>hadoop0:2181,hadoop1:2181,hadoop2:2181</value>
  </property>
  <!-- Znode path where resourcemanager state is stored -->
  <property>
    <name>yarn.resourcemanager.zk-state-store.parent-path</name>
    <value>/rmstore</value>
  </property>
  <!-- Enable resourcemanager restart (recovery) -->
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <!-- Persist resourcemanager state in zookeeper -->
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
  </property>
  <!-- Enable nodemanager restart (recovery) -->
  <property>
    <name>yarn.nodemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <!-- IPC address of the nodemanager -->
  <property>
    <name>yarn.nodemanager.address</name>
    <value>0.0.0.0:45454</value>
  </property>
</configuration>
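
Once both resourcemanagers are running, their HA roles can be queried directly (rm1/rm2 are the ids configured above):

yarn rmadmin -getServiceState rm1    # expect active
yarn rmadmin -getServiceState rm2    # expect standby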

Hadoop cluster initialization
zookeeper
echo 1 > /usr/local/zookeeper-3.4.6/data/myid    (on hadoop0)
echo 2 > /usr/local/zookeeper-3.4.6/data/myid    (on hadoop1)
echo 3 > /usr/local/zookeeper-3.4.6/data/myid    (on hadoop2)

Start the zookeeper ensemble on every node: cd /usr/local/zookeeper-3.4.6/ && ./bin/zkServer.sh start
Check zookeeper status:                     ./bin/zkServer.sh status

Format zkfc on hadoop0:           ./bin/hdfs zkfc -formatZK
Start journalnode on every node:  ./sbin/hadoop-daemon.sh start journalnode

Format hdfs on hadoop0:  ./bin/hdfs namenode -format
Copy the formatted namenode metadata directory from hadoop0 to hadoop1:
scp -r tmp/dfs hadoop1:/usr/local/hadoop-2.7.0/tmp/
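
Instead of copying the metadata by hand, the standby namenode can also pull it from the active one; a standard alternative (run on hadoop1 after the namenode on hadoop0 has been started):

./bin/hdfs namenode -bootstrapStandby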

Start the hadoop cluster
First ssh to every node once so the host keys are accepted.
Start dfs on hadoop0: ./sbin/start-dfs.sh
This launches the following processes:
namenode                   (hadoop0, hadoop1)
journalnode                (hadoop0, hadoop1, hadoop2)
DFSZKFailoverController    (hadoop0, hadoop1)
datanode                   (hadoop1, hadoop2; as listed in the slaves file)

Start YARN on hadoop1: ./sbin/start-yarn.sh
This starts the ResourceManager on hadoop1 and a NodeManager on hadoop1 and hadoop2.

Start the standby (failover) ResourceManager on hadoop0: ./sbin/yarn-daemon.sh start resourcemanager

Start YARN's web application proxy on hadoop1:
yarn-daemon.sh start proxyserver
Note: the proxyserver plays a firewall-like role and makes access to the cluster safer.

Start the MapReduce job history server on hadoop0: ./sbin/mr-jobhistory-daemon.sh start historyserver
Check namenode status:  http://hadoop0:50070     http://hadoop1:50070
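
With automatic failover enabled, the active/standby roles can also be checked from the CLI, and a failover can be provoked by stopping the active namenode (a sketch):

hdfs haadmin -getServiceState nn1          # one of active/standby
hdfs haadmin -getServiceState nn2
./sbin/hadoop-daemon.sh stop namenode      # run on the currently active node
hdfs haadmin -getServiceState nn2          # should report active within seconds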
