Configuration examples for big-data components
Covers HDFS, Hive, HBase, and Spark
I. Hadoop Cluster Setup
A simple Hadoop cluster built with Docker
1. Create the working directory
$ mkdir /opt/hadoop
# Two configuration files need to be created:
# hadoop.env          holds the Hadoop cluster configuration
# docker-compose.yml  defines the Docker containers
2. hadoop.env
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=true
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
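Each variable in hadoop.env is turned into a Hadoop property by the image's entrypoint. The mapping below summarizes the naming convention as documented for the bde2020 images; double-check it against the image version you actually use.
# Prefix selects the target file:
#   CORE_CONF_* -> core-site.xml   HDFS_CONF_* -> hdfs-site.xml   YARN_CONF_* -> yarn-site.xml
# In the rest of the name, "_" becomes "." and "___" becomes "-", for example:
#   CORE_CONF_fs_defaultFS=hdfs://namenode:8020      ->  fs.defaultFS = hdfs://namenode:8020
#   YARN_CONF_yarn_log___aggregation___enable=true   ->  yarn.log-aggregation-enable = true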
3. docker-compose.yml
version: "3"
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    restart: always
    container_name: namenode
    hostname: namenode
    volumes:
      - /opt/hadoop/data/hadoop_namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      - "9870:9870"
  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: resourcemanager
    hostname: resourcemanager
    ports:
      - "8088:8088"
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
    environment:
      - YARN_CONF_yarn_resourcemanager_webapp_address=0.0.0.0:8088
  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: historyserver
    hostname: historyserver
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    volumes:
      - /opt/hadoop/data/hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
    ports:
      - "8188:8188"
  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: nodemanager1
    hostname: nodemanager1
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
    ports:
      - "8042:8042"
  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: datanode1
    hostname: datanode1
    depends_on:
      - namenode
    ports:
      - "5642:5642"
    volumes:
      - /opt/hadoop/data/hadoop_datanode1:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    environment:
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:5640
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:5641
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:5642
  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: datanode2
    hostname: datanode2
    depends_on:
      - namenode
    ports:
      - "5645:5645"
    volumes:
      - /opt/hadoop/data/hadoop_datanode2:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    environment:
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:5643
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:5644
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:5645
  datanode3:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.2-java8
    restart: always
    container_name: datanode3
    hostname: datanode3
    depends_on:
      - namenode
    ports:
      - "5648:5648"
    volumes:
      - /opt/hadoop/data/hadoop_datanode3:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    environment:
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:5646
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:5647
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:5648
4. Service port reference
All ports can be changed as needed; just make sure they do not conflict with anything else on the host.
namenode: defaults to 9870
resourcemanager: defaults to 8088. If it conflicts with another project, change it, and remember to update the port mapping as well.
For example, to change 8088 to 5888:
    ports:
      - "5888:5888"
    environment:
      - YARN_CONF_yarn_resourcemanager_webapp_address=0.0.0.0:5888
historyserver: defaults to 8188
nodemanager: defaults to 8042
datanode1: note that the default port is 9864; a non-default port is used here so that several datanodes can run on the same machine, each exposing a different port
datanode2: same as above, just with a different port
datanode3: same as above, just with a different port
The mount directory is up to you; here everything is mounted under /opt/hadoop/data, and the data directory is created automatically.
5. Start the cluster
$ cd /opt/hadoop
$ docker-compose up -d
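Once the containers are up, it is worth checking that every service actually started. A minimal sketch, using the container names from the compose file above and assuming hdfs is on the container's PATH (it is in the bde2020 images):
$ docker-compose ps                              # all services should show "Up"
$ docker logs -f namenode                        # watch the NameNode finish starting up
$ docker exec namenode hdfs dfsadmin -report     # should list the three datanodes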
6. Web UI access
Each web UI can be reached directly at the Docker host IP plus the corresponding service port.
Note: to view file contents or download files from the HDFS web UI, the following datanode mappings must be added to the hosts file of the machine you are browsing from.
hosts entries:
<Docker host IP> namenode
<Docker host IP> datanode1
<Docker host IP> datanode2
<Docker host IP> datanode3
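For example, on Linux or macOS the entries could be appended like this (192.168.1.100 is a hypothetical Docker host address; substitute your own):
$ sudo tee -a /etc/hosts <<'EOF'
192.168.1.100 namenode
192.168.1.100 datanode1
192.168.1.100 datanode2
192.168.1.100 datanode3
EOF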
7. Configuration files
# The configuration files live in the NameNode container under /opt/hadoop-3.2.1/etc/hadoop
Copy core-site.xml and hdfs-site.xml out of the container, download a Hadoop distribution, and replace its configuration files with these two; HDFS can then be accessed from the command line.
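A sketch of those steps, assuming the container layout above and a Hadoop distribution unpacked at /opt/hadoop-client (a path chosen here purely for illustration):
$ docker cp namenode:/opt/hadoop-3.2.1/etc/hadoop/core-site.xml /opt/hadoop-client/etc/hadoop/
$ docker cp namenode:/opt/hadoop-3.2.1/etc/hadoop/hdfs-site.xml /opt/hadoop-client/etc/hadoop/
$ /opt/hadoop-client/bin/hdfs dfs -ls /          # list the HDFS root to verify connectivity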
8. Getting the active NameNode IP
When a Hadoop cluster has more than one NameNode (usually two, acting as active and standby), the client configuration sometimes cannot use a virtual host for HA, so the active NameNode's IP has to be written into the configuration file by hand. After a failover that IP changes, and updating it manually is tedious. The script below finds the IP of the active NameNode; a scheduled task can then be started in the runtime to refresh the configuration file with its output.
#!/bin/bash
# Probe each NameNode in turn; the standby rejects client operations,
# so the existence test only succeeds against the active one.
active_node=''
namenodes=("172.18.89.103" "172.18.89.104")
for value in "${namenodes[@]}"
do
  if hadoop fs -test -e hdfs://$value:25000/ >/dev/null 2>&1; then
    active_node=$value
    break
  fi
done
echo "$active_node"
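As a usage sketch, the script could be run periodically from cron and its output written to a file that the config-refresh job reads; get_active_nn.sh and both paths below are hypothetical names chosen for illustration:
# Hypothetical crontab entry: every minute, record the current active NameNode IP
* * * * * /opt/hadoop/get_active_nn.sh > /opt/hadoop/active_namenode.ip 2>/dev/null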