Setting Up a Spark 2.4.7 Cluster on CentOS

1. Prerequisites

Java, Scala, Hadoop, and ZooKeeper must already be installed.
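
A quick sanity check before moving on; this sketch assumes the relevant commands (including ZooKeeper's zkServer.sh) are already on the PATH:

java -version
scala -version
hadoop version
# assumes zkServer.sh is on the PATH
zkServer.sh status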

2. Extract Spark

tar -zxvf spark-2.4.7-bin-hadoop2.7.tgz
mv spark-2.4.7-bin-hadoop2.7 spark-2.4.7

3. Add environment variables

vim /etc/profile
export SPARK_HOME=/app/hadoop/software/spark-2.4.7
export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
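
Reload the profile so the variables take effect in the current shell, then confirm Spark is found:

source /etc/profile
echo $SPARK_HOME
spark-submit --version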

4. Create directories

# Run these inside the Spark installation directory
mkdir -p {pid,logs,tmpdata}
# Create the HDFS directory that will hold history server event logs
hadoop fs -mkdir -p /spark/log/historyEventLog
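
To confirm the event-log directory was created on HDFS:

hadoop fs -ls /spark/log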

5. Edit the spark-env.sh file

cp conf/spark-env.sh.template conf/spark-env.sh
vim conf/spark-env.sh

The configuration is as follows:

SPARK_HOME=/app/hadoop/software/spark-2.4.7
export JAVA_HOME=/usr/local/jdk1.8.0_241/
SPARK_MASTER_WEBUI_PORT=8080
SPARK_PID_DIR="${SPARK_HOME}/pid"
SPARK_MASTER_HOST=data-repo-04
SPARK_MASTER_PORT=7077
# Point Spark at the Hadoop configuration directory; this can also be set in /etc/profile. Every Spark node keeps its own Hadoop installation, so the Hadoop config files do not have to be copied into spark/conf (that alternative is sketched after this block)
export HADOOP_CONF_DIR=/app/hadoop/software/hadoop-2.9.2/etc/hadoop
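
If you prefer not to set HADOOP_CONF_DIR, the alternative mentioned in the comment above is to copy the Hadoop client configuration into spark/conf instead; roughly (a sketch, using the paths from this guide):

cp /app/hadoop/software/hadoop-2.9.2/etc/hadoop/core-site.xml \
   /app/hadoop/software/hadoop-2.9.2/etc/hadoop/hdfs-site.xml \
   /app/hadoop/software/spark-2.4.7/conf/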

6. Edit the spark-defaults.conf file

cp conf/spark-defaults.conf.template conf/spark-defaults.conf
vim conf/spark-defaults.conf

Add the following:

spark.executor.extraJavaOptions -Xss1M
spark.port.maxRetries 32
spark.driver.extraJavaOptions -Djava.library.path=/app/hadoop/software/hadoop-2.9.2/lib/native
spark.local.dir /app/hadoop/software/spark-2.4.7/tmpdata

# history server
spark.master=spark://data-repo-04:7077
spark.eventLog.enabled=true
spark.eventLog.dir=hdfs://hadoop-cluster/spark/log/historyEventLog
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.driver.memory=1g
spark.history.fs.logDirectory=hdfs://hadoop-cluster/spark/log/historyEventLog
spark.history.ui.port=18080
spark.history.fs.update.interval=10s
spark.history.retainedApplications=50
spark.history.fs.cleaner.enabled=false
spark.history.fs.cleaner.interval=1d
spark.history.fs.cleaner.maxAge=7d
spark.history.ui.acls.enable=false

7. Edit the slaves file

cp conf/slaves.template conf/slaves
vim conf/slaves

Add the list of worker hosts:

data-repo-05
data-repo-06
data-repo-07
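
Note that sbin/start-slaves.sh reaches each host in conf/slaves over SSH, so the master needs passwordless SSH to the workers. If that is not already in place from the Hadoop setup, a sketch:

# run on the master as the hadoop user; generate a key first with ssh-keygen if none exists
ssh-copy-id hadoop@data-repo-05
ssh-copy-id hadoop@data-repo-06
ssh-copy-id hadoop@data-repo-07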

8. Copy the Hadoop installation directory to the data-repo-06 and data-repo-07 nodes

# The current Hadoop nodes are data-repo-01 through data-repo-05
scp -r hadoop-2.9.2/ hadoop@data-repo-06:~/software/
scp -r hadoop-2.9.2/ hadoop@data-repo-07:~/software/

Set the Hadoop environment variables on the data-repo-06 and data-repo-07 nodes:

vim /etc/profile

export HADOOP_HOME=/app/hadoop/software/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib"
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native/:$LD_LIBRARY_PATH

9. Copy the Spark installation directory to the worker nodes

scp -r spark-2.4.7/ hadoop@data-repo-05:~/software/
scp -r spark-2.4.7/ hadoop@data-repo-06:~/software/
scp -r spark-2.4.7/ hadoop@data-repo-07:~/software/

Set the Spark environment variables on the data-repo-05 through data-repo-07 nodes:

export SPARK_HOME=/app/hadoop/software/spark-2.4.7
export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
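
A quick way to confirm the variables are picked up on a worker (assuming passwordless SSH as above):

ssh hadoop@data-repo-05 'source /etc/profile; echo $SPARK_HOME'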

10. Start the master (run on the Spark master node, data-repo-04)

./sbin/start-master.sh
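
To verify the master is up, look for a Master process and skim its log (the exact log file name varies with the user and hostname):

jps | grep Master
tail -n 20 $SPARK_HOME/logs/*Master*.out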

11. Start the worker nodes (run on the Spark master node)

./sbin/start-slaves.sh
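
Each worker should now show a Worker process, and the master web UI should list all three workers. A quick check (assumes jps is on the PATH for non-interactive SSH sessions):

for h in data-repo-05 data-repo-06 data-repo-07; do ssh hadoop@$h jps; done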

12. Start the history server

The history server records the execution history of Spark applications and is mainly used for troubleshooting.

./sbin/start-history-server.sh
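
To confirm it is running, look for the HistoryServer process or query Spark's monitoring REST API:

jps | grep HistoryServer
curl http://data-repo-04:18080/api/v1/applications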

13. Access the Spark web UIs

Spark web UI: http://data-repo-04:8080/
Spark history server: http://data-repo-04:18080/

14. Test Spark

spark-submit --class org.apache.spark.examples.SparkPi \
  --master spark://data-repo-04:7077 \
  --driver-cores 8 \
  --driver-memory 1G \
  --executor-memory 2G \
  --total-executor-cores 2 \
  examples/jars/spark-examples_2.11-2.4.7.jar 1000

15. Test Spark reading from and writing to HDFS
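
The word-count example below reads /input/test.txt from HDFS, so upload a test file first if it does not exist yet (a sketch, assuming a local test.txt):

hadoop fs -mkdir -p /input
hadoop fs -put test.txt /input/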

# Start spark-shell
spark-shell

# Run the following Spark code
val rdd = sc.textFile("/input/test.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_)
rdd.saveAsTextFile("hdfs://hadoop-cluster/output_sparktest")

Check whether the output is present on HDFS:

hadoop fs -ls /
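
To inspect the word-count output itself:

hadoop fs -cat /output_sparktest/part-*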

Integrating Spark with Hive

1. Copy the Hive installation directory to the Spark nodes

# The current Hive nodes are data-repo-01 and data-repo-02; copy the Hive installation directory to the other Spark nodes
scp -r hive-3.1.2/ hadoop@data-repo-04:~/software/
scp -r hive-3.1.2/ hadoop@data-repo-05:~/software/
scp -r hive-3.1.2/ hadoop@data-repo-06:~/software/
scp -r hive-3.1.2/ hadoop@data-repo-07:~/software/

Then set the environment variables:

vim /etc/profile

export HIVE_HOME=/app/hadoop/software/hive-3.1.2
export HIVE_CONF_DIR=$HIVE_HOME/conf
export PATH=$PATH:$HIVE_HOME/bin

source /etc/profile

2. Copy hive-site.xml from the Hive conf directory to spark/conf

cp ../hive-3.1.2/conf/hive-site.xml conf/

3. Copy the MySQL driver jar from the Hive lib directory to spark/jars

cp ../hive-3.1.2/lib/mysql-connector-java-5.1.49.jar jars/

4. Restart Spark (run on the Spark master node)

./sbin/stop-slaves.sh
./sbin/stop-master.sh

./sbin/start-master.sh
./sbin/start-slaves.sh

5. Verify with spark-sql

spark-sql

# List all Hive tables
show tables;
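
The same check can also be run non-interactively; a sketch:

spark-sql -e "show databases; show tables;"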

