
Sharing Our Production HAProxy Kernel Parameter Tuning

HAProxy
CPU: 8 cores
Memory: 16G
Count: 4

Servers
Count: 150
Type: HTTP/HTTPS, serving GET/POST requests, returning JSON data and generating logs
Stable concurrent sessions supported: 400K

System configuration
# grep -E 'maxconn|nbproc' /etc/haproxy/haproxy.cfg

maxconn     200000
nbproc           7

# cat /etc/security/limits.d/90-nproc.conf

# Default limit for number of user's processes to prevent
# accidental fork bombs.
# See rhbz #432903 for reasoning.

*          -    nproc     4096
root       -    nproc     unlimited

# cat /etc/security/limits.d/90-nofile.conf

*          -    nofile     200000

# cat /etc/sysctl.conf

# Kernel sysctl configuration file for Red Hat Linux
#
# For binary values, 0 is disabled, 1 is enabled.  See sysctl(8) and
# sysctl.conf(5) for more details.

# Controls IP packet forwarding
net.ipv4.ip_forward = 1
net.ipv4.ip_nonlocal_bind = 1

# Controls source route verification
net.ipv4.conf.default.rp_filter = 0

# Do not accept source routing
net.ipv4.conf.default.accept_source_route = 0

# Controls the System Request debugging functionality of the kernel
kernel.sysrq = 0

# Controls whether core dumps will append the PID to the core filename.
# Useful for debugging multi-threaded applications.
kernel.core_uses_pid = 1

# Controls the use of TCP syncookies
net.ipv4.tcp_syncookies = 1

# Disable netfilter on bridges.
net.bridge.bridge-nf-call-ip6tables = 0
net.bridge.bridge-nf-call-iptables = 0
net.bridge.bridge-nf-call-arptables = 0

# Controls the maximum size of a message, in bytes
kernel.msgmnb = 65536

# Controls the default maximum size of a message queue
kernel.msgmax = 65536

# Controls the maximum shared segment size, in bytes
kernel.shmmax = 68719476736

# Controls the maximum number of shared memory segments, in pages
kernel.shmall = 4294967296

# Maximize ephemeral port range
net.ipv4.ip_local_port_range = 1024 65535

# ARP related
net.ipv4.conf.all.arp_notify = 1
net.ipv4.conf.default.arp_ignore = 1
net.ipv4.conf.default.arp_announce = 2

# General gigabit tuning
net.core.somaxconn = 32768
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.core.rmem_default = 16777216
net.core.wmem_default = 16777216
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 87380 16777216
net.ipv4.tcp_mem = 94500000 915000000 927000000

# Give the kernel more memory for tcp
# which need with many (100k+) open socket connections
net.core.netdev_max_backlog = 262144
net.ipv4.tcp_max_syn_backlog = 262144
net.ipv4.tcp_max_tw_buckets = 2000000
net.ipv4.tcp_tw_recycle = 1
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_timestamps = 0
net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_fin_timeout = 30
net.ipv4.tcp_keepalive_probes = 5
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_time = 1800
net.ipv4.tcp_slow_start_after_idle = 0

## Protect against tcp time-wait assassination hazards
## drop RST packets for sockets in the time-wait state
net.ipv4.tcp_rfc1337 = 1

# Ensure that immediately subsequent connections use the new values
net.ipv4.route.flush = 1

# Increase system file descriptor limit
fs.file-max = 200000
kernel.pid_max = 65536

# Limit number of orphans, each orphan can eat up to 16M (max wmem) of unswappable memory
net.ipv4.tcp_max_orphans = 60000
net.ipv4.tcp_synack_retries = 3
net.ipv4.tcp_syn_retries = 3
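
To apply these settings without a reboot and spot-check a few of them, something like the following can be used (a quick verification sketch, not part of the original tuning notes):

# sysctl -p /etc/sysctl.conf
# sysctl net.core.somaxconn net.ipv4.tcp_max_syn_backlog fs.file-max

The limits.d changes only take effect for new login sessions; running ulimit -n in a fresh shell should report 200000.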


Hadoop Operations Notes: Upgrading from CDH5.0.0 to CDH5.3.0

References:
Hadoop: http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_ig_earlier_cdh5_upgrade.html?scroll=topic_8
Oozie: http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_ig_oozie_upgrade.html
Hive: http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_ig_hive_upgrade.html
Pig: http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_ig_pig_upgrade.html

1. Stop Monit on all Hadoop servers (we use Monit in production to monitor processes)
Log in to idc2-admin1 (we use idc2-admin1 in production as the admin host and Yum repo server)
# mkdir /root/cdh530_upgrade_from_500
# cd /root/cdh530_upgrade_from_500
# pssh -i -h idc2-hnn-rm-hive 'service monit stop'
# pssh -i -h idc2-hmr.active 'service monit stop'

2. Confirm that the local CDH5.3.0 Yum repo server is ready
http://idc2-admin1/repo/cdh/5.3.0/
http://idc2-admin1/repo/cloudera-gplextras5.3.0/
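
A quick reachability check against both repos can save a failed mass upgrade later (an illustrative check, assuming the standard createrepo layout):

# curl -sI http://idc2-admin1/repo/cdh/5.3.0/repodata/repomd.xml | head -n1
# curl -sI http://idc2-admin1/repo/cloudera-gplextras5.3.0/repodata/repomd.xml | head -n1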

3. Update the corresponding repo templates in Ansible (we use Ansible in production as our configuration management tool)

{% if "idc2" in group_names %}

...

{% if "cdh5-all" in group_names %}
[heylinux.el6.cloudera-cdh5.3.0]
name=el6 yum cloudera cdh5.3.0
baseurl=http://idc2-admin1/repo/cdh/5.3.0
enabled=1
gpgcheck=0

[heylinux.el6.cloudera-gplextras5.3.0]
name=el6 yum cloudera gplextras5.3.0
baseurl=http://idc2-admin1/repo/cloudera-gplextras5.3.0
enabled=1
gpgcheck=0
{% else %}

...

{% endif %}

4. Update the repo file (/etc/yum.repos.d/heylinux.repo) on all Hadoop servers
# ansible-playbook --private-key /path/to/key_root -u root --vault-password-file=/path/to/vault_passwd.file base.yml -i hosts.idc2 --tags localrepos --limit cdh5-all
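
To confirm the repo file actually landed on the target hosts, a spot check along these lines can be run (illustrative only):

# pssh -i -h idc2-hnn-rm-hive 'grep -c cdh5.3.0 /etc/yum.repos.d/heylinux.repo'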

5. Upgrade HDFS
5.1. Get the current Active Namenode (we created a CNAME on our production DNS servers that always checks for and points to the Active Namenode)
# host active-idc2-hnn
active-idc2-hnn.heylinux.com is an alias for idc2-hnn2.heylinux.com
idc2-hnn2.heylinux.com has address 172.16.2.12

5.2. On the Active NameNode, enter safe mode and save a new fsimage, then wait for the whole process to finish.
# sudo -u hdfs hdfs dfsadmin -safemode enter
# sudo -u hdfs hdfs dfsadmin -saveNamespace
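
Before proceeding, it is worth confirming the NameNode really is in safe mode (a verification step not in the original notes):

# sudo -u hdfs hdfs dfsadmin -safemode get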

5.3 Stop all Hadoop services
Back on idc2-admin1, go to the working directory
# cd /root/cdh530_upgrade_from_500

First use pssh to stop the Hadoop processes in bulk on the Namenode, ResourceManager and Hive servers (write the corresponding server addresses or hostname lists into idc2-hnn-rm-hive and idc2-hmr.active)
# pssh -i -h idc2-hnn-rm-hive 'for x in `cd /etc/init.d ; ls hadoop-*` ; do sudo service $x status ; done'
# pssh -i -h idc2-hmr.active 'for x in `cd /etc/init.d ; ls hadoop-*` ; do sudo service $x status ; done'

# pssh -i -h idc2-hnn-rm-hive 'for x in `cd /etc/init.d ; ls hadoop-*` ; do sudo service $x stop ; done'
# pssh -i -h idc2-hmr.active 'for x in `cd /etc/init.d ; ls hadoop-*` ; do sudo service $x stop ; done'

# Check for a libhadoop.so file that conflicts with the new version, and remove it if present (we have Snappy installed in production; it generates its own libhadoop.so, which conflicts with the one shipped in CDH5.3.0, and places it under the current JDK lib directory).
# pssh -i -h idc2-hnn-rm-hive 'rm -f /usr/java/jdk1.7.0_45/jre/lib/amd64/libhadoop.so'
# pssh -i -h idc2-hmr.active 'rm -f /usr/java/jdk1.7.0_45/jre/lib/amd64/libhadoop.so'
Back up the HDFS metadata on the NameNodes (our production HA pair consists of two NameNodes, idc2-hnn1 and idc2-hnn2):
# mkdir /root/cdh530upgrade
# cd /root/cdh530upgrade
# tar -cf /root/nn_backup_data.data1.`date +%Y%m%d`.tar /data1/dfs/nn
# tar -cf /root/nn_backup_data.data2.`date +%Y%m%d`.tar /data2/dfs/nn

6. Upgrade the Hadoop packages
Log in to the Hive server idc2-hive1 and upgrade it
# yum clean all; yum upgrade hadoop

Log in to the ResourceManager servers idc2-rm1 and idc2-rm2 and upgrade them
# yum clean all; yum upgrade hadoop

Back on idc2-admin1, upgrade all the Datanode servers idc2-hmr*
# pssh -i -h idc2-hmr.active 'yum clean all; yum upgrade hadoop hadoop-lzo -y'

Log in to idc2-hnn1 (the Standby Namenode, as determined by the earlier host active-idc2-hnn command) and upgrade it
# yum clean all; yum upgrade hadoop hadoop-lzo

Log in to idc2-hnn2 (the Active Namenode, as determined by the earlier host active-idc2-hnn command) and upgrade it
# yum clean all; yum upgrade hadoop hadoop-lzo

Back on idc2-admin1, upgrade all the Hadoop Clients
# pssh -i -h idc2-client 'yum clean all; yum upgrade hadoop -y'
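
A quick way to confirm every host now runs the new version (illustrative check):

# pssh -i -h idc2-hmr.active 'hadoop version | head -n1'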

7. Start the services
Log in to the Journal Nodes and start the service (idc2-hnn1, idc2-hnn2 and idc2-rm1 in our production environment)
# service hadoop-hdfs-journalnode start

Log in to all the DataNodes and start the service (the idc2-hmr* servers in our production environment)
# service hadoop-hdfs-datanode start

Log in to the Active NameNode and upgrade the HDFS metadata
# service hadoop-hdfs-namenode upgrade
# tailf /var/log/hadoop/hadoop-hdfs-namenode-`hostname -s`.heylinux.com.log

Wait until the whole process finishes, e.g. until something like the following appears in the log:
/var/lib/hadoop-hdfs/cache/hadoop/dfs/<name> is complete.

Wait until the NameNode exits Safe Mode, then restart the Standby NameNode

Log in to the Standby NameNode and restart the service
# sudo -u hdfs hdfs namenode -bootstrapStandby
# service hadoop-hdfs-namenode start

Log in to all the ResourceManagers and start the service
# service hadoop-yarn-resourcemanager start

Log in to all the NodeManagers and start the service (the idc2-hmr* servers in our production environment)
# service hadoop-yarn-nodemanager start

Start the HistoryServer on the Active ResourceManager (idc2-rm1 in our production environment)
# service hadoop-mapreduce-historyserver start
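
To verify that all NodeManagers have re-registered, something like the following can be run on a ResourceManager (illustrative check):

# yarn node -list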

That completes the Hadoop part of the upgrade. The sections below cover upgrading Hive, Oozie and Pig.

8. Upgrade the Hive and Oozie servers (in our production environment both are installed on a single machine, idc2-hive1)
8.1 Upgrade the Hive server
Back up the Metastore database
# mkdir -p /root/backupfiles/hive
# cd /root/backupfiles/hive
# mysqldump -uoozie -pheylinux metastore > metastore.sql.bak.`date +%Y%m%d`

Update hive-site.xml

Confirm the following settings are present in hive-site.xml:
<property>
  <name>datanucleus.autoCreateSchema</name>
  <value>false</value>
</property>
<property>
  <name>datanucleus.fixedDatastore</name>
  <value>true</value>
</property>

Stop the Hive services
# service hive-server2 stop
# service hive-metastore stop

Upgrade the Hive packages
# yum upgrade hive hive-metastore hive-server2 hive-jdbc
# yum install hive-hbase hive-hcatalog hive-webhcat

Upgrade the Hive Metastore schema
# sudo -u oozie /usr/lib/hive/bin/schematool -dbType mysql -upgradeSchemaFrom 0.12.0
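
schematool can also report the resulting schema version afterwards, which is a cheap sanity check (not part of the original procedure):

# sudo -u oozie /usr/lib/hive/bin/schematool -dbType mysql -info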

Start the Hive services
# service hive-metastore start
# service hive-server2 start

8.2 Upgrade the Oozie server
Back up the Oozie database
# mkdir -p /root/backupfiles/hive
# cd /root/backupfiles/hive
# mysqldump -uoozie -pheylinux oozie > oozie.sql.bak.`date +%Y%m%d`

Back up the Oozie configuration files
# tar cf oozie.conf.bak.`date +%Y%m%d` /etc/oozie/conf

Stop Oozie
# service oozie stop

Upgrade the Oozie packages
# yum upgrade oozie oozie-client

Carefully compare the parameters in the new configuration files against the old ones, and port the parameters from the old configuration into the new files

Back up the Oozie lib directory
# tar cf oozie.lib.bak.`date +%Y%m%d` /var/lib/oozie

Upgrade the Oozie database
# sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh upgrade -run

Upgrade the Oozie Shared Library
# sudo -u oozie hadoop fs -mv /user/oozie/share /user/oozie/share.orig.`date +%Y%m%d`
# sudo oozie-setup sharelib create -fs hdfs://idc1-hnn2:8020 -locallib /usr/lib/oozie/oozie-sharelib-yarn.tar.gz

Move all the libraries from /user/oozie/share/lib/lib_<new_date_string> to /user/oozie/share/lib (<new_date_string> is the timestamp on the newly generated directory)
# sudo -u oozie mv /user/oozie/share/lib/lib_<new_date_string>/* /user/oozie/share/lib/
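
Listing the directory afterwards confirms the libraries are in place (illustrative check):

# sudo -u oozie hadoop fs -ls /user/oozie/share/lib | head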

Check all the files under /user/oozie/share in HDFS and compare them one by one with the files in the backed-up share.orig.`date +%Y%m%d`; for packages whose names carry a "cdh5" version string keep only the newer ones, and copy everything else into the new lib directory.

Start the Oozie server
# service oozie start

9. Upgrade Pig
Kill all running Pig processes
# pkill -kill -f pig

Upgrade the Pig package
# yum upgrade pig

10. Once all packages have been upgraded and HDFS is working normally, run the finalizeUpgrade command as the final step
Log in to the Active Namenode and run the following command
# sudo -u hdfs hdfs dfsadmin -finalizeUpgrade
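
After finalizing, an fsck gives a quick overall health check of the upgraded filesystem (optional, not in the original notes):

# sudo -u hdfs hdfs fsck / | tail -n5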



Chronic Daily Headache

These past few days I worked overtime to finish an urgent CDH5-related task, and today my manager replied by email:
Good job guys, you save me from having Chronic Daily Headache :)

English puns achieve much the same effect as Chinese ones. Chinese puns usually play on sound; English puns can play on sound too, but more often play on form, like Chronic Daily Headache above, which puns on CDH.

PS: Being praised really does feel good, heh.


Automated Installation and Deployment of a CDH5 Cluster with Ansible

Background:
I previously wrote two series of posts on installing and deploying CDH3 and CDH4:
http://heylinux.com/archives/1980.html
http://heylinux.com/archives/2827.html

But as CDH5 has evolved, especially as its Namenode high availability has matured, the project decided to migrate our existing Hadoop clusters to CDH5.
To make management easier, I used Ansible to automate the installation and deployment of the whole CDH5 cluster, and it is currently running well in production.

Configuration:
I have put all the configuration on GitHub, in the following repository:
https://github.com/mcsrainbow/ansible-playbooks-cdh5



Installing and Configuring BitTorrent Sync on Linux

Background:
We currently have a fairly large number of front-end servers in production, more than 200. Whenever we release a new application, the package is placed on a dedicated Push server, and all front-end servers sync it automatically via rsync. But as the number of front-end servers keeps growing, the Push server's bandwidth has become the bottleneck.
P2P-style synchronization with BitTorrent Sync is one solution. Its cross-platform support is also very good: there are clients for Windows, Linux, Mac OS and even mobile. It is not open source yet, but it is free to use, which is still quite nice.

Below are the installation and configuration steps in our Linux-only production test environment:
Servers:
idc2-server1,idc2-server2,idc2-server3

1. Download BitTorrent Sync, on all servers:
$ sudo wget http://download-lb.utorrent.com/endpoint/btsync/os/linux-x64/track/stable -O /tmp/btsync_x64.tar.gz
$ sudo mkdir /opt/btsync
$ cd /opt/btsync
$ sudo tar xzf /tmp/btsync_x64.tar.gz

2. Create the service management script, on all servers:
$ sudo vim /etc/init.d/btsync

 
#!/bin/sh
#
# description: starts and stops the btsync client

CONF=/opt/btsync/btsync.cfg
PROC=/opt/btsync/btsync
PIDFILE=/opt/btsync/btsync.pid

start() {
  PID1=$(pidof btsync)
  if [ -z "${PID1}" ]; then
    echo -n "Starting BitTorrent Sync: "
    ${PROC} --config ${CONF}
  else
    echo "BitTorrent Sync is already running at pid:${PID1}"
  fi
  return $?
}

stop() {
  echo -n "Stopping BitTorrent Sync: "
  PID1=$(pidof btsync)
  if [ -n "${PID1}" ]; then
    kill -9 ${PID1}
    echo "OK"
  else
    echo "Failed"
  fi
  return $?
}

status() {
  # Compare the running pid with the one recorded in the pidfile
  PID1=$(pidof btsync)
  PID2=$(cat ${PIDFILE} 2>/dev/null)
  echo -n "Checking BitTorrent Sync: "
  if [ -n "${PID1}" ] && [ "${PID1}" = "${PID2}" ]; then
    echo "OK"
  else
    echo "Failed"
  fi
  return $?
}

case "$1" in
  start)
   start
  ;;
  stop)
    stop
  ;;
  restart)
    stop
    sleep 1
    start
  ;;
  status)
    status
  ;;
  *)
    echo $"Usage: $0 {start|stop|restart|status}"
    exit 2
esac

$ sudo chmod +x /etc/init.d/btsync

3. Create the directory to be synced, on all servers:
$ sudo mkdir /opt/btsync_transfer

4. Create the configuration file, on idc2-server1:
[heydevops@idc2-server1 btsync]$ sudo vim /opt/btsync/btsync.cfg

 
{ 
  "device_name": "idc2-server1",
  "listening_port" : 8889, // 0 - randomize port

  "check_for_updates" : false,
  "use_upnp" : false,

  "storage_path" : "/opt/btsync",
  "pid_file" : "/opt/btsync/btsync.pid",

  "download_limit" : 0, // 0 - no limit
  "upload_limit" : 0, 

  "webui" :
  {
    "listen" : "0.0.0.0:8888",
    "login" : "admin",
    "password" : "btsync"
  },

  "folder_rescan_interval" : 60,
  "lan_encrypt_data" : false,
  "lan_use_tcp" : true
}

5. Create the secret needed for syncing, on idc2-server1:
$ sudo /etc/init.d/btsync start

Open the Web UI: http://idc2-server1:8888
Username: admin
Password: btsync

点击 "Add Folder",
在 "Path" 中输入 "/opt/btsync_transfer"
点击 "Generate" 得到 "Secret" 为 "ALUORWDEWOLV354ZHPHFT4TSQO67JWQAN"
如下图所示:
2
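
If preferred, the secret can also be generated from the command line instead of the Web UI (the btsync binary provides a --generate-secret option):

$ sudo /opt/btsync/btsync --generate-secret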



Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

Series index
Hadoop Cluster (CDH4) in Practice, Part 0: Preface
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

This post covers
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

References
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html

Environment
OS: CentOS 6.4 x86_64
Servers:
hadoop-master: 172.17.20.230 (10G RAM)
- namenode
- hbase-master

hadoop-secondary: 172.17.20.234 (10G RAM)
- secondarybackupnamenode,jobtracker
- hive-server,hive-metastore
- oozie
- sqoop

hadoop-node-1: 172.17.20.231 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-2: 172.17.20.232 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-3: 172.17.20.233 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

A brief description of the roles above:
namenode - manages the namespace of the entire HDFS
secondarynamenode - can be regarded as a redundant service for the namenode
jobtracker - job management service for parallel computation
datanode - HDFS storage node service
tasktracker - job execution service for parallel computation
hbase-master - HBase management service
hbase-regionserver - serves client inserts, deletes, queries and so on
zookeeper-server - Zookeeper coordination and configuration management service
hive-server - Hive management service
hive-metastore - the Hive metastore, which performs type checking and syntax analysis on metadata
oozie - Oozie is a Java web application for defining and managing workflows
sqoop - Sqoop is a tool for transferring data between relational databases and HDFS

Conventions used in this post, to avoid confusion when configuring multiple servers:
All of the following operations need to be performed only on the Sqoop host, i.e. hadoop-secondary.

1. Prerequisites
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup

2. Install Sqoop
$ sudo yum install sqoop sqoop-metastore

3. Start the Sqoop Metastore
$ sudo service sqoop-metastore start

4. Configure the JDBC drivers
MySQL JDBC Driver:
$ sudo yum install mysql-connector-java
$ sudo ln -s /usr/share/java/mysql-connector-java.jar /usr/lib/sqoop/lib/mysql-connector-java.jar

Microsoft SQL Server JDBC Driver:
$ wget http://download.microsoft.com/download/0/2/A/02AAE597-3865-456C-AE7F-613F99F850A8/sqljdbc_4.0.2206.100_enu.tar.gz
$ tar xzvf sqljdbc_4.0.2206.100_enu.tar.gz
$ sudo cp sqljdbc_4.0/enu/sqljdbc4.jar /usr/lib/sqoop/lib/

5. Configure HCAT_HOME
$ sudo vim /etc/profile.d/sqoop.sh

 
export HCAT_HOME=/var/lib/sqoop

$ source /etc/profile
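
A quick way to verify the installation and the MySQL driver is to ask Sqoop for its version and list the databases on the MySQL instance set up in the Hive post (a hypothetical connectivity test; adjust host and credentials to your environment):

$ sqoop version
$ sqoop list-databases --connect jdbc:mysql://hadoop-secondary:3306 --username hive -P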

6. At this point, the Sqoop installation is complete.



Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup

Series index
Hadoop Cluster (CDH4) in Practice, Part 0: Preface
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

This post covers
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup

References
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html

Environment
OS: CentOS 6.4 x86_64
Servers:
hadoop-master: 172.17.20.230 (10G RAM)
- namenode
- hbase-master

hadoop-secondary: 172.17.20.234 (10G RAM)
- secondarybackupnamenode,jobtracker
- hive-server,hive-metastore
- oozie

hadoop-node-1: 172.17.20.231 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-2: 172.17.20.232 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-3: 172.17.20.233 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

A brief description of the roles above:
namenode - manages the namespace of the entire HDFS
secondarynamenode - can be regarded as a redundant service for the namenode
jobtracker - job management service for parallel computation
datanode - HDFS storage node service
tasktracker - job execution service for parallel computation
hbase-master - HBase management service
hbase-regionserver - serves client inserts, deletes, queries and so on
zookeeper-server - Zookeeper coordination and configuration management service
hive-server - Hive management service
hive-metastore - the Hive metastore, which performs type checking and syntax analysis on metadata
oozie - Oozie is a Java web application for defining and managing workflows

Conventions used in this post, to avoid confusion when configuring multiple servers:
All of the following operations need to be performed only on the Oozie host, i.e. hadoop-secondary.

1. Prerequisites
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup

2. Install Oozie
$ sudo yum install oozie oozie-client

3. Create the Oozie database
$ mysql -uroot -phiveserver

 
mysql> create database oozie;
mysql> grant all privileges on oozie.* to 'oozie'@'localhost' identified by 'oozie';
mysql> grant all privileges on oozie.* to 'oozie'@'%' identified by 'oozie';
mysql> exit;

4. Configure oozie-site.xml
$ sudo vim /etc/oozie/conf/oozie-site.xml

 
<?xml version="1.0"?>
<configuration>
    <property>
        <name>oozie.service.ActionService.executor.ext.classes</name>
        <value>
            org.apache.oozie.action.email.EmailActionExecutor,
            org.apache.oozie.action.hadoop.HiveActionExecutor,
            org.apache.oozie.action.hadoop.ShellActionExecutor,
            org.apache.oozie.action.hadoop.SqoopActionExecutor,
            org.apache.oozie.action.hadoop.DistcpActionExecutor
        </value>
    </property>
    <property>
        <name>oozie.service.SchemaService.wf.ext.schemas</name>
        <value>shell-action-0.1.xsd,shell-action-0.2.xsd,email-action-0.1.xsd,hive-action-0.2.xsd,hive-action-0.3.xsd,hive-action-0.4.xsd,hive-action-0.5.xsd,sqoop-action-0.2.xsd,sqoop-action-0.3.xsd,ssh-action-0.1.xsd,ssh-action-0.2.xsd,distcp-action-0.1.xsd</value>
    </property>
    <property>
        <name>oozie.system.id</name>
        <value>oozie-${user.name}</value>
    </property>
    <property>
        <name>oozie.systemmode</name>
        <value>NORMAL</value>
    </property>
    <property>
        <name>oozie.service.AuthorizationService.security.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>oozie.service.PurgeService.older.than</name>
        <value>30</value>
    </property>
    <property>
        <name>oozie.service.PurgeService.purge.interval</name>
        <value>3600</value>
    </property>
    <property>
        <name>oozie.service.CallableQueueService.queue.size</name>
        <value>10000</value>
    </property>
    <property>
        <name>oozie.service.CallableQueueService.threads</name>
        <value>10</value>
    </property>
    <property>
        <name>oozie.service.CallableQueueService.callable.concurrency</name>
        <value>3</value>
    </property>
    <property>
        <name>oozie.service.coord.normal.default.timeout</name>
        <value>120</value>
    </property>

    <property>
        <name>oozie.db.schema.name</name>
        <value>oozie</value>
    </property>
    <property>
        <name>oozie.service.JPAService.create.db.schema</name>
        <value>true</value>
    </property>

    <property>
        <name>oozie.service.JPAService.jdbc.driver</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>oozie.service.JPAService.jdbc.url</name>
        <value>jdbc:mysql://localhost:3306/oozie</value>
    </property>
    <property>
        <name>oozie.service.JPAService.jdbc.username</name>
        <value>oozie</value>
    </property>
    <property>
        <name>oozie.service.JPAService.jdbc.password</name>
        <value>oozie</value>
    </property>

    <property>
        <name>oozie.service.JPAService.pool.max.active.conn</name>
        <value>10</value>
    </property>

    <property>
        <name>oozie.service.HadoopAccessorService.kerberos.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>local.realm</name>
        <value>LOCALHOST</value>
    </property>
    <property>
        <name>oozie.service.HadoopAccessorService.keytab.file</name>
        <value>${user.home}/oozie.keytab</value>
    </property>
    <property>
        <name>oozie.service.HadoopAccessorService.kerberos.principal</name>
        <value>${user.name}/localhost@${local.realm}</value>
    </property>
    <property>
        <name>oozie.service.HadoopAccessorService.jobTracker.whitelist</name>
        <value> </value>
    </property>
    <property>
        <name>oozie.service.HadoopAccessorService.nameNode.whitelist</name>
        <value> </value>
    </property>

    <property>
        <name>oozie.service.HadoopAccessorService.hadoop.configurations</name>
        <value>*=/etc/hadoop/conf</value>
    </property>
    <property>
        <name>oozie.service.WorkflowAppService.system.libpath</name>
        <value>/user/${user.name}/share/lib</value>
    </property>

    <property>
        <name>use.system.libpath.for.mapreduce.and.pig.jobs</name>
        <value>false</value>
    </property>

    <property>
        <name>oozie.authentication.type</name>
        <value>simple</value>
    </property>
    <property>
        <name>oozie.authentication.token.validity</name>
        <value>36000</value>
    </property>
    <property>
        <name>oozie.authentication.signature.secret</name>
        <value>oozie</value>
    </property>

    <property>
      <name>oozie.authentication.cookie.domain</name>
      <value></value>
    </property>

    <property>
        <name>oozie.authentication.simple.anonymous.allowed</name>
        <value>true</value>
    </property>

    <property>
        <name>oozie.authentication.kerberos.principal</name>
        <value>HTTP/localhost@${local.realm}</value>
    </property>

    <property>
        <name>oozie.authentication.kerberos.keytab</name>
        <value>${oozie.service.HadoopAccessorService.keytab.file}</value>
    </property>

    <property>
        <name>oozie.authentication.kerberos.name.rules</name>
        <value>DEFAULT</value>
    </property>

    <property>
        <name>oozie.service.ProxyUserService.proxyuser.oozie.hosts</name>
        <value>*</value>
    </property>

    <property>
        <name>oozie.service.ProxyUserService.proxyuser.oozie.groups</name>
        <value>*</value>
    </property>

    <property>
        <name>oozie.service.ProxyUserService.proxyuser.hue.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>oozie.service.ProxyUserService.proxyuser.hue.groups</name>
        <value>*</value>
    </property>

    <property>
        <name>oozie.action.mapreduce.uber.jar.enable</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.service.HadoopAccessorService.supported.filesystems</name>
        <value>hdfs,viewfs</value>
    </property>
</configuration>

5. Configure the Oozie Web Console
$ cd /tmp/
$ wget http://archive.cloudera.com/gplextras/misc/ext-2.2.zip
$ cd /var/lib/oozie/
$ sudo unzip /tmp/ext-2.2.zip
$ cd ext-2.2/
$ sudo -u hdfs hadoop fs -mkdir /user/oozie
$ sudo -u hdfs hadoop fs -chown oozie:oozie /user/oozie

6. Configure the Oozie ShareLib
$ mkdir /tmp/ooziesharelib
$ cd /tmp/ooziesharelib
$ tar xzf /usr/lib/oozie/oozie-sharelib.tar.gz
$ sudo -u oozie hadoop fs -put share /user/oozie/share
$ sudo -u oozie hadoop fs -ls /user/oozie/share
$ sudo -u oozie hadoop fs -ls /user/oozie/share/lib
$ sudo -u oozie hadoop fs -put /usr/lib/hive/lib/hbase.jar /user/oozie/share/lib/hive/
$ sudo -u oozie hadoop fs -put /usr/lib/hive/lib/zookeeper.jar /user/oozie/share/lib/hive/
$ sudo -u oozie hadoop fs -put /usr/lib/hive/lib/hive-hbase-handler-0.10.0-cdh4.5.0.jar /user/oozie/share/lib/hive/
$ sudo -u oozie hadoop fs -put /usr/lib/hive/lib/guava-11.0.2.jar /user/oozie/share/lib/hive/
$ sudo ln -s /usr/share/java/mysql-connector-java.jar /var/lib/oozie/mysql-connector-java.jar

7. Start Oozie
$ sudo service oozie start

8. Access the Oozie Web Console
http://hadoop-secondary:11000/oozie
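
The same status is also available from the CLI (an optional check); a healthy server reports System mode: NORMAL:

$ oozie admin -oozie http://hadoop-secondary:11000/oozie -status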

9. At this point, the Oozie setup is complete.



Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup

Series index
Hadoop Cluster (CDH4) in Practice, Part 0: Preface
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

This post covers
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup

References
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html

Environment
OS: CentOS 6.4 x86_64
Servers:
hadoop-master: 172.17.20.230 (10G RAM)
- namenode
- hbase-master

hadoop-secondary: 172.17.20.234 (10G RAM)
- secondarybackupnamenode,jobtracker
- hive-server,hive-metastore

hadoop-node-1: 172.17.20.231 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-2: 172.17.20.232 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-3: 172.17.20.233 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

A brief description of the roles above:
namenode - manages the namespace of the entire HDFS
secondarynamenode - can be regarded as a redundant service for the namenode
jobtracker - job management service for parallel computation
datanode - HDFS storage node service
tasktracker - job execution service for parallel computation
hbase-master - HBase management service
hbase-regionserver - serves client inserts, deletes, queries and so on
zookeeper-server - Zookeeper coordination and configuration management service
hive-server - Hive management service
hive-metastore - the Hive metastore, which performs type checking and syntax analysis on metadata

Conventions used in this post, to avoid confusion when configuring multiple servers:
All of the following operations need to be performed only on the Hive host, i.e. hadoop-secondary.

1. Prerequisites
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup

2. Install Hive
$ sudo yum install hive hive-metastore hive-server
$ sudo yum install hive-jdbc hive-hbase

3. Install the MySQL JDBC Connector
$ sudo yum install mysql-connector-java
$ sudo ln -s /usr/share/java/mysql-connector-java.jar /usr/lib/hive/lib/mysql-connector-java.jar

4. Install MySQL
$ sudo yum install mysql-server
$ sudo /etc/init.d/mysqld start

$ sudo /usr/bin/mysql_secure_installation

 
[...]
Enter current password for root (enter for none):
OK, successfully used password, moving on...
[...]
Set root password? [Y/n] y
New password: hiveserver
Re-enter new password: hiveserver
Remove anonymous users? [Y/n] Y
[...]
Disallow root login remotely? [Y/n] N
[...]
Remove test database and access to it [Y/n] Y
[...]
Reload privilege tables now? [Y/n] Y
All done!

5. Create the database and grant privileges
$ mysql -u root -phiveserver

 
mysql> CREATE DATABASE metastore;
mysql> USE metastore;
mysql> SOURCE /usr/lib/hive/scripts/metastore/upgrade/mysql/hive-schema-0.10.0.mysql.sql;

mysql> CREATE USER 'hive'@'%' IDENTIFIED BY 'hiveserver';
mysql> GRANT SELECT,INSERT,UPDATE,DELETE ON metastore.* TO 'hive'@'%';
mysql> REVOKE ALTER,CREATE ON metastore.* FROM 'hive'@'%';

mysql> CREATE USER 'hive'@'localhost' IDENTIFIED BY 'hiveserver';
mysql> GRANT SELECT,INSERT,UPDATE,DELETE ON metastore.* TO 'hive'@'localhost';
mysql> REVOKE ALTER,CREATE ON metastore.* FROM 'hive'@'localhost';

mysql> CREATE USER 'hive'@'127.0.0.1' IDENTIFIED BY 'hiveserver';
mysql> GRANT SELECT,INSERT,UPDATE,DELETE ON metastore.* TO 'hive'@'127.0.0.1';
mysql> REVOKE ALTER,CREATE ON metastore.* FROM 'hive'@'127.0.0.1';
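
A quick login test confirms the grants work as intended (illustrative check):

$ mysql -u hive -phiveserver -e 'SHOW TABLES;' metastore | head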

6. Configure hive-site.xml
$ sudo vim /etc/hive/conf/hive-site.xml

 
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:mysql://hadoop-secondary/metastore</value>
  <description>the URL of the MySQL database</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionDriverName</name>
  <value>com.mysql.jdbc.Driver</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>hive</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>hiveserver</value>
</property>
<property>
  <name>datanucleus.autoCreateSchema</name>
  <value>false</value>
</property>
<property>
  <name>datanucleus.fixedDatastore</name>
  <value>true</value>
</property>
<property>
  <name>datanucleus.autoStartMechanism</name> 
  <value>SchemaTable</value>
</property> 
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://hadoop-secondary:9083</value>
  <description>IP address (or fully-qualified domain name) and port of the metastore host</description>
</property>
<property>
  <name>hive.aux.jars.path</name>
<value>file:///usr/lib/hive/lib/hbase.jar,file:///usr/lib/hive/lib/zookeeper.jar,file:///usr/lib/hive/lib/hive-hbase-handler-0.10.0-cdh4.5.0.jar,file:///usr/lib/hive/lib/guava-11.0.2.jar</value>
</property>
<property>
  <name>hbase.zookeeper.quorum</name>
  <value>hadoop-node-1,hadoop-node-2,hadoop-node-3</value>
</property>
</configuration>

7. Start Hive
$ /etc/init.d/hive-metastore start
$ /etc/init.d/hive-server start

8. Create the HDFS directories needed by Hive
$ sudo -u hdfs hadoop fs -mkdir /user/hive
$ sudo -u hdfs hadoop fs -mkdir /user/hive/warehouse
$ sudo -u hdfs hadoop fs -ls -R /user
$ sudo -u hdfs hadoop fs -chown -R hive /user/hive
$ sudo -u hdfs hadoop fs -chmod -R 1777 /user/hive/warehouse

$ sudo -u hdfs hadoop fs -chmod -R 777 /tmp/hadoop-mapred
$ sudo -u hdfs hadoop fs -chmod -R 777 /tmp/hive-hive
$ sudo chown -R hive:hive /var/lib/hive/.hivehistory
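
A minimal smoke test of the new setup (optional):

$ hive -e 'SHOW DATABASES;'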

9. At this point, the Hive setup is complete.



Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup

Series index
Hadoop Cluster (CDH4) in Practice, Part 0: Preface
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

This post covers
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup

References
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html

Environment
OS: CentOS 6.4 x86_64
Servers:
hadoop-master: 172.17.20.230 (10G RAM)
- namenode
- hbase-master

hadoop-secondarynamenode: 172.17.20.234 (10G RAM)
- secondarybackupnamenode,jobtracker

hadoop-node-1: 172.17.20.231 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-2: 172.17.20.232 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

hadoop-node-3: 172.17.20.233 (10G RAM)
- datanode,tasktracker
- hbase-regionserver,zookeeper-server

A brief description of the roles above:
namenode - manages the namespace of the entire HDFS
secondarynamenode - can be regarded as a redundant service for the namenode
jobtracker - job management service for parallel computation
datanode - HDFS storage node service
tasktracker - job execution service for parallel computation
hbase-master - HBase management service
hbase-regionserver - serves client inserts, deletes, queries and so on
zookeeper-server - Zookeeper coordination and configuration management service

Conventions used in this post, to avoid confusion when configuring multiple servers:
Any command that starts directly with $ and is not prefixed with a hostname must be executed on all servers, unless a trailing // note or the heading says otherwise.

1. Prerequisites
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup

Configure NTP time synchronization
$ sudo yum install ntp
$ sudo /etc/init.d/ntpd start

Configure the ulimit and nproc parameters
$ sudo vim /etc/security/limits.conf

 
hdfs  -       nofile  32768
hbase -       nofile  32768

Log out of the SSH session and log back in for the settings to take effect

2. Install hbase-master on hadoop-master
$ sudo yum install hbase-master
$ sudo yum install hbase-rest
$ sudo yum install hbase-thrift

3. Install hbase-regionserver on the hadoop-nodes
$ sudo yum install hbase-regionserver

4. Create the HBase directory in HDFS
The following HDFS operations need to be executed only once, on any one host
$ sudo -u hdfs hadoop fs -mkdir /hbase
$ sudo -u hdfs hadoop fs -chown hbase /hbase

5. Configure hbase-site.xml
$ sudo vim /etc/hbase/conf/hbase-site.xml
$ cat /etc/hbase/conf/hbase-site.xml

 
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
  <name>hbase.rest.port</name>
  <value>60050</value>
</property>
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://hadoop-master:8020/hbase</value>
</property>
<property>
  <name>hbase.zookeeper.quorum</name>
  <value>hadoop-node-1,hadoop-node-2,hadoop-node-3</value>
</property>
</configuration>

6. Configure regionservers
$ sudo vim /etc/hbase/conf/regionservers

 
hadoop-node-1
hadoop-node-2
hadoop-node-3

7. Install Zookeeper
$ sudo yum install zookeeper
$ sudo vim /etc/zookeeper/conf/zoo.cfg
$ cat /etc/zookeeper/conf/zoo.cfg

 
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/var/lib/zookeeper
clientPort=2181
maxClientCnxns=0
server.1=hadoop-node-1:2888:3888
server.2=hadoop-node-2:2888:3888
server.3=hadoop-node-3:2888:3888

8. Install zookeeper-server on the hadoop-nodes and create the myid file
$ sudo yum install zookeeper-server
$ sudo touch /var/lib/zookeeper/myid
$ sudo chown -R zookeeper:zookeeper /var/lib/zookeeper
$ echo 1 > /var/lib/zookeeper/myid //only on hadoop-node-1
$ echo 2 > /var/lib/zookeeper/myid //only on hadoop-node-2
$ echo 3 > /var/lib/zookeeper/myid //only on hadoop-node-3

$ sudo /etc/init.d/zookeeper-server init //run only once, on any one hadoop-node
$ sudo /etc/init.d/zookeeper-server start
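
Each Zookeeper server can be health-checked with the four-letter ruok command (assumes nc is installed; a healthy server answers imok):

$ echo ruok | nc hadoop-node-1 2181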

9. Start the HBase services
On hadoop-master only
$ sudo /etc/init.d/hbase-master start
$ sudo /etc/init.d/hbase-thrift start
$ sudo /etc/init.d/hbase-rest start

On the hadoop-nodes only
$ sudo /etc/init.d/hbase-regionserver start
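
A quick cluster check can be run from any node with the HBase client installed (illustrative):

$ echo "status" | hbase shell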

10. Check the service status
Via the web page: http://hadoop-master:60010

11. At this point, the HBase & Zookeeper setup is complete.



Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup

Series index
Hadoop Cluster (CDH4) in Practice, Part 0: Preface
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup
Hadoop Cluster (CDH4) in Practice, Part 2: HBase & Zookeeper Setup
Hadoop Cluster (CDH4) in Practice, Part 3: Hive Setup
Hadoop Cluster (CDH4) in Practice, Part 4: Oozie Setup
Hadoop Cluster (CDH4) in Practice, Part 5: Sqoop Installation

This post covers
Hadoop Cluster (CDH4) in Practice, Part 1: Hadoop (HDFS) Setup

References
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Installation-Guide/CDH4-Installation-Guide.html

Environment
OS: CentOS 6.4 x86_64
Servers:
hadoop-master: 172.17.20.230 (10G RAM)
- namenode

hadoop-secondarynamenode: 172.17.20.234 (10G RAM)
- secondarybackupnamenode,jobtracker

hadoop-node-1: 172.17.20.231 (10G RAM)
- datanode,tasktracker

hadoop-node-2: 172.17.20.232 (10G RAM)
- datanode,tasktracker

hadoop-node-3: 172.17.20.233 (10G RAM)
- datanode,tasktracker

A brief description of the roles above:
namenode - manages the namespace of the entire HDFS
secondarynamenode - can be regarded as a redundant service for the namenode
jobtracker - job management service for parallel computation
datanode - HDFS storage node service
tasktracker - job execution service for parallel computation

Conventions used in this post, to avoid confusion when configuring multiple servers:
Any command that starts directly with $ and is not prefixed with a hostname must be executed on all servers, unless a trailing // note or the heading says otherwise.

1. Choosing the best installation packages
To deploy the Hadoop cluster more conveniently and consistently, we use Cloudera's integrated packages.
Cloudera has optimized many of the Hadoop-related components, avoiding many bugs caused by version mismatches between the individual components.
This is also what many experienced Hadoop administrators recommend.
https://ccp.cloudera.com/display/DOC/Documentation/

2. Install the Java environment
Since the Hadoop project is mostly developed in Java, it requires a JVM.
Log in to www.oracle.com (creating an ID is required) and download a 64-bit JDK, such as jdk-7u45-linux-x64.rpm, from:
http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html

$ sudo rpm -ivh jdk-7u45-linux-x64.rpm
$ sudo vim /etc/profile.d/java.sh

 
export JAVA_HOME=/usr/java/jdk1.7.0_45
export JRE_HOME=$JAVA_HOME/jre
export CLASSPATH=.:$JAVA_HOME/lib:$JRE_HOME/lib:$CLASSPATH
export PATH=$JAVA_HOME/bin:$JRE_HOME/bin:$PATH

$ sudo chmod +x /etc/profile.d/java.sh
$ source /etc/profile

3. Configure the Hadoop installation repo
$ sudo rpm --import http://archive.cloudera.com/cdh4/redhat/5/x86_64/cdh/RPM-GPG-KEY-cloudera
$ cd /etc/yum.repos.d/
$ sudo wget http://archive.cloudera.com/cdh4/redhat/6/x86_64/cdh/cloudera-cdh4.repo

4. Install the Hadoop packages, choosing the MRv1 framework
$ sudo yum install hadoop-hdfs-namenode //only on hadoop-master

$ sudo yum install hadoop-hdfs-secondarynamenode //only on hadoop-secondary
$ sudo yum install hadoop-0.20-mapreduce-jobtracker //only on hadoop-secondary

$ sudo yum install hadoop-hdfs-datanode //only on the hadoop-nodes
$ sudo yum install hadoop-0.20-mapreduce-tasktracker //only on the hadoop-nodes

$ sudo yum install hadoop-client

5. Create the Hadoop configuration files
$ sudo cp -r /etc/hadoop/conf.dist /etc/hadoop/conf.my_cluster

6. Activate the new configuration
$ sudo alternatives --verbose --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.my_cluster 50
$ sudo alternatives --set hadoop-conf /etc/hadoop/conf.my_cluster
$ cd /etc/hadoop/conf
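
alternatives can confirm which configuration directory is now active (an optional check):

$ alternatives --display hadoop-conf | head -n3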

7. Add hosts entries and set the matching hostnames
$ sudo vim /etc/hosts

 
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6

172.17.20.230 hadoop-master
172.17.20.234 hadoop-secondary
172.17.20.231 hadoop-node-1
172.17.20.232 hadoop-node-2
172.17.20.233 hadoop-node-3

8. Install LZO support
$ cd /etc/yum.repos.d
$ sudo wget http://archive.cloudera.com/gplextras/redhat/6/x86_64/gplextras/cloudera-gplextras4.repo
$ sudo yum install hadoop-lzo-cdh4

9. Configure the files under hadoop/conf
$ sudo vim /etc/hadoop/conf/masters

 
hadoop-master

$ sudo vim /etc/hadoop/conf/slaves

 
hadoop-node-1
hadoop-node-2
hadoop-node-3

10. Create the local data directories for Hadoop
$ sudo mkdir -p /data/{1,2,3,4}/mapred/local
$ sudo chown -R mapred:hadoop /data/{1,2,3,4}/mapred/local

$ sudo mkdir -p /data/1/dfs/nn /nfsmount/dfs/nn /data/1/dfs/ns /data/{1,2,3,4}/dfs/dn
$ sudo chown -R hdfs:hdfs /data/1/dfs/nn /nfsmount/dfs/nn /data/1/dfs/ns /data/{1,2,3,4}/dfs/dn
$ sudo chmod 700 /data/1/dfs/nn /nfsmount/dfs/nn /data/1/dfs/ns /data/{1,2,3,4}/dfs/dn

$ sudo mkdir /data/tmp
$ sudo chmod 1777 /data/tmp

11. Configure core-site.xml
$ sudo vim /etc/hadoop/conf/core-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
 <name>fs.defaultFS</name>
 <value>hdfs://hadoop-master:8020</value>
</property>
<property>
 <name>hadoop.tmp.dir</name>
 <value>/data/tmp/hadoop-${user.name}</value>
</property>

<property>
  <name>hadoop.proxyuser.oozie.hosts</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.oozie.groups</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hive.hosts</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hive.groups</name>
  <value>*</value>
</property>
</configuration>

12. Configure hdfs-site.xml
$ sudo vim /etc/hadoop/conf/hdfs-site.xml

 
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
 <name>dfs.namenode.name.dir</name>
 <value>/data/1/dfs/nn,/nfsmount/dfs/nn</value>
</property>
<property>
  <name>dfs.namenode.http-address</name>
  <value>hadoop-master:50070</value>
</property>

<property>
  <name>fs.namenode.checkpoint.period</name>
  <value>3600</value>
</property>
<property>
  <name>fs.namenode.checkpoint.dir</name>
  <value>/data/1/dfs/ns</value>
</property>
<property>
  <name>dfs.namenode.secondary.http-address</name>
  <value>hadoop-secondary:50090</value>
</property>

<property>
  <name>dfs.replication</name>
  <value>3</value>
</property>
<property>
 <name>dfs.permissions.superusergroup</name>
 <value>supergroup</value>
</property>
<property>
 <name>dfs.datanode.data.dir</name>
 <value>/data/1/dfs/dn,/data/2/dfs/dn,/data/3/dfs/dn</value>
</property>
<property>
  <name>dfs.datanode.max.xcievers</name>
  <value>4096</value>
</property>
</configuration>


