def coalesce ( numPartitions : Int , shuffle : Boolean = false ): RDD [T]
def repartition ( numPartitions : Int ): RDD [T]
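coalesce 和 repartition 都會把 RDD 的資料重新分配到給定數量(numPartitions)的分區中。coalesce 預設不進行 shuffle(shuffle = false),此時只能減少分區數;repartition 一定會對資料進行 shuffle,等同於 coalesce(numPartitions, shuffle = true),因此既可以減少也可以增加分區數。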
Java 代碼如下:
package com.cb.spark.sparkrdd;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
 * Small runnable example showing how {@code coalesce} and {@code repartition}
 * change the number of partitions of a {@link JavaRDD}.
 *
 * <p>The program builds a ten-partition RDD on a local master, derives one RDD
 * with {@code coalesce(2, false)} and another with {@code repartition(5)}, and
 * prints the partition count of each.
 */
public class CoalesceExample {

    /**
     * Runs the example and prints three partition counts to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Coalesce").setMaster("local");

        // JavaSparkContext is Closeable; try-with-resources guarantees the context
        // is stopped even when one of the Spark calls below throws.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Explicitly request 10 slices so the source RDD starts with 10 partitions.
            JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 10);
            System.out.println(rdd.partitions().size()); // 10

            // Merge the data of the original ten partitions into two partitions;
            // shuffle is passed as false.
            JavaRDD<Integer> coalesceRDD = rdd.coalesce(2, false);
            System.out.println(coalesceRDD.partitions().size()); // 2

            // Redistribute the same source RDD into five partitions.
            JavaRDD<Integer> repartitionRDD = rdd.repartition(5);
            System.out.println(repartitionRDD.partitions().size()); // 5
        }
    }
}