Spark reduceByKey用法

List类型

代码:

object SparkTest {

    def main(args: Array[String]): Unit = {
        // Local-mode Spark configuration using all available cores.
        val conf: SparkConf = new SparkConf()
                .setMaster("local[*]")
                .setAppName("SparkTest")

        val sc: SparkContext = new SparkContext(conf)

        // Pair RDD whose values are Lists; reduceByKey concatenates the
        // Lists of equal keys with `:::` (List concatenation), so "a"
        // ends up with List((1,2), (5,6)).
        val data = Array(("a", List((1L, 2))),("a", List((5L, 6))), ("b", List((2L, 3))))
        val rdd = sc.parallelize(data)
        rdd.reduceByKey(_ ::: _)
                .foreach(println)

        // Stop the context to release resources (was missing; the second
        // example below already does this).
        sc.stop()
    }
}

运行结果:

(b,List((2,3)))
(a,List((1,2), (5,6)))

Set类型

代码:

object SparkTest {
  def main(args: Array[String]): Unit = {
    // Local-mode Spark configuration using all available cores.
    val conf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("SparkTest")

    val sc: SparkContext = new SparkContext(conf)

    // (name, gender) pairs; "李四" appears with two different genders.
    val people = List(("张三", "男"), ("李四", "男"), ("李四", "女"), ("王五", "女"))
    val pairs = sc.parallelize(people)

    // Wrap each gender in a singleton Set, then merge Sets per key with
    // `++` — duplicates collapse automatically, so each name maps to the
    // distinct set of genders seen for it.
    val merged = pairs
      .map { case (name, gender) => (name, Set(gender)) }
      .reduceByKey(_ ++ _)

    merged.collect().foreach(println)

    sc.stop()
  }
}

运行结果:

(张三,Set(男))
(王五,Set(女))
(李四,Set(男, 女))
如果觉得我的文章对你有用,请随意赞赏