package cn.edu360.day8

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
  * Created by zx on 2017/10/16.
  */
object JoinTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("JoinTest")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._
    //import org.apache.spark.sql.functions._

    // Disable automatic broadcast joins so a shuffle-based join strategy is used
    //spark.sql.autoBroadcastJoinThreshold=-1
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    //spark.conf.set("spark.sql.join.preferSortMergeJoin", true)
    //println(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))

    val df1 = Seq(
      (0, "playing"),
      (1, "with"),
      (2, "join")
    ).toDF("id", "token")

    val df2 = Seq(
      (0, "P"),
      (1, "W"),
      (2, "S")
    ).toDF("aid", "atoken")

    //df2.repartition(2) // repartition returns a new DataFrame; calling it without keeping the result has no effect
    //df1.cache().count()

    val result: DataFrame = df1.join(df2, $"id" === $"aid")

    // Inspect the physical plan to see which join strategy the planner chose
    result.explain()
    result.show()

    spark.stop()
  }
}
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
Broadcast join is used automatically when one side of the join is smaller than spark.sql.autoBroadcastJoinThreshold, which defaults to 10 MB. Setting it to -1 disables broadcast joins entirely, and the value can also be raised to broadcast larger tables.
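Conversely, even with the threshold set to -1, a broadcast join can still be forced with the broadcast() hint from org.apache.spark.sql.functions. A minimal sketch, reusing df1 and df2 from the example above (expecting BroadcastHashJoin in the plan is an assumption about the planner's usual choice):

import org.apache.spark.sql.functions.broadcast

// Hint that df2 is small enough to ship to every executor;
// the explicit hint overrides spark.sql.autoBroadcastJoinThreshold = -1
val hinted: DataFrame = df1.join(broadcast(df2), $"id" === $"aid")
hinted.explain() // the physical plan should show BroadcastHashJoin
hinted.show()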
Sort-merge join: both sides are shuffled and sorted on the join key first, then the sorted partitions are merged to produce the join result.
Shuffled hash join: rows from both sides are hashed on the join key into matching partitions, and each partition pair is joined by building a hash table on the smaller side. Note that in Spark 2.x this is not the default shuffle strategy: sort-merge join is preferred unless spark.sql.join.preferSortMergeJoin is set to false and the planner's size checks pass.
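A sketch of steering the planner between the two shuffle strategies, to be placed inside JoinTest's main method after df1 and df2 are defined. spark.sql.join.preferSortMergeJoin is an internal Spark 2.x setting, and whether ShuffledHashJoin actually appears also depends on the planner's size checks, so treat the expected plans as assumptions rather than guarantees:

// With broadcast joins disabled, Spark 2.x prefers sort-merge join
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
df1.join(df2, $"id" === $"aid").explain() // expect SortMergeJoin

// Disabling the sort-merge preference lets the planner consider a
// shuffled hash join. Its size check is tied to the broadcast threshold,
// so with -1 it can never pass; a small positive threshold (still below
// the build side's size) may be needed for ShuffledHashJoin to appear.
spark.conf.set("spark.sql.join.preferSortMergeJoin", false)
df1.join(df2, $"id" === $"aid").explain() // may show ShuffledHashJoin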