1. 驱动程序(driver program)---->
Master URLs:
seqOp seqOp会被并行执行,具体由各个executor上的task来完成计算
val z = sc.parallelize(List(1,2,3,4,5,6), 2)
z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
val z =sc.parallelize(List("a","b","c","d","e","f"),2)
z.aggregate("")(_ + _, _+_)
res115: String = abcdef
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]
def coalesce ( numPartitions : Int , shuffle : Boolean= false ): RDD [T]
val y = sc.parallelize(1 to 10, 10)
val z = y.coalesce(2, false)
res9: Int = 2
Listing Variants
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions:Int): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def groupWith[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def groupWith[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.map((_, "b"))
val c = a.map((_, "c"))
res7: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
val d = a.map((_, "d"))
b.cogroup(c, d).collect
res9: Array[(Int, (Seq[String], Seq[String],Seq[String]))] = Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d,d)))
val x = sc.parallelize(List((1, "apple"),(2, "banana"), (3, "orange"), (4, "kiwi")), 2)
val y = sc.parallelize(List((5, "computer"),(1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
res23: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]):RDD[U]
def toArray(): Array[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu,Rat)
def collectAsMap(): Map[K, V]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.zip(a)
res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1-> 1, 3 -> 3)
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions:Int): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner:Partitioner, mapSideCombine: Boolean = true, serializerClass: String = null):RDD[(K, C)]
val a =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
val c = b.zip(a)
val d = c.combineByKey(List(_), (x:List[String],y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
res16: Array[(Int, List[String])] = Array((1,List(cat,dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
def compute(split: Partition, context: TaskContext):Iterator[T]
context, sparkContext:
def context: SparkContext
def sparkContext: SparkContext
def count(): Long
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res2: Long = 4
def countApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble]
countByKey [Pair]:
def countByKey(): Map[K, Long]
val c = sc.parallelize(List((3, "Gnu"), (3,"Yak"), (5, "Mouse"), (3, "Dog")), 2)
res3: scala.collection.Map[Int,Long] = Map(3 -> 3,5 -> 1)
def countByValue(): Map[T, Long]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
res27: scala.collection.Map[Int,Long] = Map(5 -> 1,8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
def countByValueApprox(timeout: Long, confidence:Double = 0.95): PartialResult[Map[T, BoundedDouble]]
def countApproxDistinct(relativeSD: Double = 0.05): Long
val a = sc.parallelize(1 to 10000, 20)
val b = a++a++a++a++a
val a = sc.parallelize(1 to 30000, 30)
val b = a++a++a++a++a
res28: Long = 30097
countApproxDistinctByKey [Pair]:
def countApproxDistinctByKey(relativeSD: Double =0.05): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,numPartitions: Int): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,partitioner: Partitioner): RDD[(K, Long)]
val a = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
val b = sc.parallelize(a.takeSample(true, 10000, 0),20)
val c = sc.parallelize(1 to b.count().toInt, 20)
val d = b.zip(c)
res15: Array[(String, Long)] = Array((Rat,2567),(Cat,3357), (Dog,2414), (Gnu,2494))
res16: Array[(String, Long)] = Array((Rat,2555),(Cat,2455), (Dog,2425), (Gnu,2513))
res0: Array[(String, Long)] = Array((Rat,2562),(Cat,2464), (Dog,2451), (Gnu,2521))
final def dependencies: Seq[Dependency[_]]
val b = sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
b: org.apache.spark.rdd.RDD[Int] =ParallelCollectionRDD[32] at parallelize at <console>:12
Int = 0
b.map(a => a).dependencies.length
res40: Int = 1
res41: Int = 2
res42: Seq[org.apache.spark.Dependency[_]] =List(org.apache.spark.rdd.CartesianRDD$$anon$1@576ddaaa,org.apache.spark.rdd.CartesianRDD$$anon$2@6d2efbbd)
def distinct(): RDD[T]
def distinct(numPartitions: Int): RDD[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res6: Array[String] = Array(Dog, Gnu, Cat, Rat)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10))
res16: Int = 2
res17: Int = 3
def first(): T
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res1: String = Gnu
非常常用的功能,内部使用返回布尔值的方法,对RDD中的每个data使用该方法,返回result RDD
def filter(f: T => Boolean): RDD[T]
val a = sc.parallelize(1 to 10, 3)
a.filter(_ % 2 == 0)
res3: Array[Int] = Array(2, 4, 6, 8, 10)
Examples for mixed data without partial functions:
val b = sc.parallelize(1 to 8)
b.filter(_ < 4).collect
res15: Array[Int] = Array(1, 2, 3)
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.filter(_ < 4).collect
<console>:15: error: value < is not a member of Any
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.collect({case a: Int => "is integer"
case b: String => "is string" }).collect
res17: Array[String] = Array(is string, is string, is integer, is string)
val myfunc: PartialFunction[Any, Any] = {
case a: Int => "is integer"
case b: String => "is string" }
myfunc.isDefinedAt("") 判断myfunc是否支持
res21: Boolean = true
res22: Boolean = true
myfunc.isDefinedAt(1.5) 不支持
res23: Boolean = false
Our research group has a very strong focus on using and improving Apache Spark to solve real-world problems. In order to do this we need to have a very solid understanding of the capabilities of Spark. So one of the first
things we have done is to go through the entire Spark RDD API and write examples to test their functionality. This has been a very useful exercise and we would like to share the examples with everyone.
Authors of examples: Matthias Langer and Zhen He
Email addresses: m.langer@latrobe.edu.au, z.he@latrobe.edu.au
These examples have only been tested for Spark version 0.9. We assume the functionality of Spark is stable and therefore the examples should be valid for later releases.
Here is a PDF of all the examples: SparkExamples
The RDD API By Example
RDD is short for Resilient Distributed Dataset. RDDs are the workhorse of the Spark system. As a user, one can consider an RDD as a handle for a collection of individual data partitions, which are the result of some computation.
However, an RDD is actually more than that. On cluster installations, separate data partitions can be on separate nodes. Using the RDD as a handle one can access all partitions and perform computations and transformations
using the contained data. Whenever a part of an RDD or an entire RDD is lost, the system is able to reconstruct the data of lost partitions by using lineage information. Lineage refers to the sequence of transformations used to produce the current RDD. As a result,
Spark is able to recover automatically from most failures.
All RDDs available in Spark derive either directly or indirectly from the class RDD. This class comes with a large set of methods that perform operations on the data within the associated partitions. The class RDD is
abstract. Whenever one uses an RDD, one is actually using a concretized implementation of RDD. These implementations have to override some core functions to make the RDD behave as expected.
One reason why Spark has lately become a very popular system for processing big data is that it does not impose restrictions regarding what data can be stored within RDD partitions. The RDD API already contains many useful
operations. But, because the creators of Spark had to keep the core API of RDDs common enough to handle arbitrary data-types, many convenience functions are missing.
The basic RDD API considers each data item as a single value. However, users often want to work with key-value pairs. Therefore Spark extended the interface of RDD to provide additional functions (PairRDDFunctions), which
explicitly work on key-value pairs. Currently, there are four extensions to the RDD API available in Spark. They are as follows:
This extension contains many useful methods for aggregating numeric values. They become available if the data items of an RDD are implicitly convertible to the Scala data-type Double.
Methods defined in this interface extension become available when the data items have a two-component tuple structure. Spark will interpret the first tuple item (i.e. tuplename._1) as the key and the second item (i.e.
tuplename._2) as the associated value.
Methods defined in this interface extension become available if the data items are two-component tuples where the key is implicitly sortable.
This extension contains several methods that allow users to create Hadoop sequence files from RDDs. The data items must be two-component key-value tuples as required by the PairRDDFunctions. However, there are additional
requirements considering the convertibility of the tuple components to Writable types.
Since Spark will make methods with extended functionality automatically available to users when the data items fulfill the above described requirements, we decided to list all possible available functions in strictly
alphabetical order. We will append either of the following to the function-name to indicate it belongs to an extension that requires the data items to conform to a certain format or type.
[Double] - Double RDD Functions
[Ordered] - OrderedRDDFunctions
[Pair] - PairRDDFunctions
[SeqFile] - SequenceFileRDDFunctions
The aggregate-method provides an interface for performing highly customized reductions and aggregations with an RDD. However, due to the way Scala and Spark execute and process data, care must be taken to achieve deterministic
behavior. The following list contains a few observations we made while experimenting with aggregate:
The reduce and combine functions have to be commutative and associative.
As can be seen from the function definition below, the output of the combiner must be equal to its input. This is necessary because Spark will chain-execute it.
The zero value is the initial value of the U component when either seqOp or combOp are executed for the first element of their domain of influence. Depending on what you want to achieve, you may have to change it.
However, to make your code deterministic, make sure that your code will yield the same result regardless of the number or size of partitions.
Do not assume any execution order for either partition computations or combining partitions.
The neutral zeroValue is applied at the beginning of each sequence of reduces within the individual partitions and again when the output of separate partitions is combined.
Why have two separate combine functions? The first function maps the input values into the result space. Note that the aggregation data type (1st input and output) can be different (U != T). The second function reduces
these mapped values in the result space.
Why would one want to use two input data types? Let us assume we do an archaeological site survey using a metal detector. While walking through the site we take GPS coordinates of important findings based on the output
of the metal detector. Later, we intend to draw an image of a map that highlights these locations using the aggregate function. In this case the zeroValue could be an area map with no highlights. The possibly huge set of input data is stored as GPS coordinates
across many partitions. seqOp could convert the GPS coordinates to map coordinates and put a marker on the map at the respective position. combOp will receive these highlights as partial maps and combine them into a single final output map.
Listing Variants
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T)=> U, combOp: (U, U) => U): U
Examples 1
val z = sc.parallelize(List(1,2,3,4,5,6), 2)
z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
val z =sc.parallelize(List("a","b","c","d","e","f"),2)
z.aggregate("")(_ + _, _+_)
res115: String = abcdef
z.aggregate("x")(_ + _, _+_)
res116: String = xxdefxabc
val z =sc.parallelize(List("12","23","345","4567"),2)
z.aggregate("")((x,y) =>math.max(x.length, y.length).toString, (x,y) => x + y)
res141: String = 42
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res142: String = 11
val z =sc.parallelize(List("12","23","345",""),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res143: String = 10
The main issue with the code above is that the result of the inner min is a string of length 1.
The zero in the output is due to the empty string being the last string in the list. We see this result because we are not recursively reducing any further within the partition for the final string.
Examples 2
val z =sc.parallelize(List("12","23","","345"),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res144: String = 11
In contrast to the previous example, this example has the empty string at the beginning of the second partition. This results in a length of zero being input to the second reduce, which then upgrades it to a length of 1. (Warning:
The above example shows bad design since the output is dependent on the order of the data inside the partitions.)
Computes the cartesian product between two RDDs (i.e. each item of the first RDD is joined with each item of the second RDD) and returns them as a new RDD. (Warning: Be careful when using this function! Memory consumption
can quickly become an issue!)
Listing Variants
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]
val x = sc.parallelize(List(1,2,3,4,5))
val y = sc.parallelize(List(6,7,8,9,10))
res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8),(1,9), (1,10), (2,6), (2,7), (2,8), (2,9), (2,10), (3,6), (3,7), (3,8), (3,9),(3,10), (4,6), (5,6), (4,7), (5,7), (4,8), (5,8), (4,9), (4,10), (5,9), (5,10))
Will create a checkpoint when the RDD is computed next. Checkpointed RDDs are stored as a binary file within the checkpoint directory which can be specified using the Spark context. (Warning: Spark applies lazy evaluation.
Checkpointing will not occur until an action is invoked.)
Important note: the directory "my_directory_name" should exist in all slaves. As an alternative you could use an HDFS directory URL as well.
Listing Variants
def checkpoint()
val a = sc.parallelize(1 to 4)
14/02/25 18:13:53 INFO SparkContext: Starting job: count at <console>:15
14/02/25 18:13:53 INFO MemoryStore: Block broadcast_5 stored as values to memory (estimated size 115.7 KB, free 296.3 MB)
14/02/25 18:13:53 INFO RDDCheckpointData: Done checkpointing RDD 11 to file:/home/cloudera/Documents/spark-0.9.0-incubating-bin-cdh4/bin/my_directory_name/65407913-fdc6-4ec1-82c9-48a1656b95d6/rdd-11, new parent is RDD
res23: Long = 4
coalesce, repartition
Coalesces the associated data into a given number of partitions. repartition(numPartitions) is simply an abbreviation for coalesce(numPartitions, shuffle = true).
Listing Variants
def coalesce ( numPartitions : Int , shuffle : Boolean= false ): RDD [T]
def repartition ( numPartitions : Int ): RDD [T]
val y = sc.parallelize(1 to 10, 10)
val z = y.coalesce(2, false)
res9: Int = 2
cogroup [Pair], groupWith [Pair]
A very powerful set of functions that allow grouping up to 3 key-value RDDs together using their keys.
Listing Variants
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions:Int): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def groupWith[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def groupWith[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.map((_, "b"))
val c = a.map((_, "c"))
res7: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
val d = a.map((_, "d"))
b.cogroup(c, d).collect
res9: Array[(Int, (Seq[String], Seq[String],Seq[String]))] = Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d,d)))
val x = sc.parallelize(List((1, "apple"),(2, "banana"), (3, "orange"), (4, "kiwi")), 2)
val y = sc.parallelize(List((5, "computer"),(1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
res23: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
collect, toArray
Converts the RDD into a Scala array and returns it. If you provide a standard map-function (i.e. f = T -> U) it will be applied before inserting the values into the result array.
Listing Variants
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]):RDD[U]
def toArray(): Array[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu,Rat)
collectAsMap [Pair]
Similar to collect, but works on key-value RDDs and converts them into Scala maps to preserve their key-value structure.
Listing Variants
def collectAsMap(): Map[K, V]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.zip(a)
res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1-> 1, 3 -> 3)
Very efficient implementation that combines the values of an RDD consisting of two-component tuples by applying multiple aggregators one after another.
Listing Variants
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions:Int): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner:Partitioner, mapSideCombine: Boolean = true, serializerClass: String = null):RDD[(K, C)]
val a =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
val c = b.zip(a)
val d = c.combineByKey(List(_), (x:List[String],y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
res16: Array[(Int, List[String])] = Array((1,List(cat,dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
Executes dependencies and computes the actual representation of the RDD. This function should not be called directly by users.
Listing Variants
def compute(split: Partition, context: TaskContext):Iterator[T]
context, sparkContext
Returns the SparkContext that was used to create the RDD.
Listing Variants
def context: SparkContext
def sparkContext: SparkContext
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res8: org.apache.spark.SparkContext =org.apache.spark.SparkContext@58c1c2f1
Returns the number of items stored within an RDD.
Listing Variants
def count(): Long
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res2: Long = 4
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble]
countByKey [Pair]
Very similar to count, but counts the values of an RDD consisting of two-component tuples for each distinct key separately.
Listing Variants
def countByKey(): Map[K, Long]
val c = sc.parallelize(List((3, "Gnu"), (3,"Yak"), (5, "Mouse"), (3, "Dog")), 2)
res3: scala.collection.Map[Int,Long] = Map(3 -> 3,5 -> 1)
countByKeyApprox [Pair]
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByKeyApprox(timeout: Long, confidence: Double= 0.95): PartialResult[Map[K, BoundedDouble]]
Returns a map that contains all unique values of the RDD and their respective occurrence counts. (Warning: This operation will finally aggregate the information in a single reducer.)
Listing Variants
def countByValue(): Map[T, Long]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
res27: scala.collection.Map[Int,Long] = Map(5 -> 1,8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByValueApprox(timeout: Long, confidence:Double = 0.95): PartialResult[Map[T, BoundedDouble]]
Computes the approximate number of distinct values. For large RDDs which are spread across many nodes, this function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinct(relativeSD: Double = 0.05):Long
val a = sc.parallelize(1 to 10000, 20)
val b = a++a++a++a++a
res14: Long = 10784
res15: Long = 11055
res16: Long = 10040
res0: Long = 10001
countApproxDistinctByKey [Pair]
Similar to countApproxDistinct, but computes the approximate number of distinct values for each distinct key. Hence, the RDD must consist of two-component tuples. For large RDDs which are spread across many nodes, this
function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinctByKey(relativeSD: Double =0.05): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,numPartitions: Int): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,partitioner: Partitioner): RDD[(K, Long)]
val a = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
val b = sc.parallelize(a.takeSample(true, 10000, 0),20)
val c = sc.parallelize(1 to b.count().toInt, 20)
val d = b.zip(c)
res15: Array[(String, Long)] = Array((Rat,2567),(Cat,3357), (Dog,2414), (Gnu,2494))
res16: Array[(String, Long)] = Array((Rat,2555),(Cat,2455), (Dog,2425), (Gnu,2513))
res0: Array[(String, Long)] = Array((Rat,2562),(Cat,2464), (Dog,2451), (Gnu,2521))
Returns the RDD on which this RDD depends.
Listing Variants
final def dependencies: Seq[Dependency[_]]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
b: org.apache.spark.rdd.RDD[Int] =ParallelCollectionRDD[32] at parallelize at <console>:12
Int = 0
b.map(a => a).dependencies.length
res40: Int = 1
res41: Int = 2
res42: Seq[org.apache.spark.Dependency[_]] =List(org.apache.spark.rdd.CartesianRDD$$anon$1@576ddaaa, org.apache.spark.rdd.CartesianRDD$$anon$2@6d2efbbd)
Returns a new RDD that contains each unique value only once.
Listing Variants
def distinct(): RDD[T]
def distinct(numPartitions: Int): RDD[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res6: Array[String] = Array(Dog, Gnu, Cat, Rat)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10))
res16: Int = 2
res17: Int = 3
Looks for the very first data item of the RDD and returns it.
Listing Variants
def first(): T
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res1: String = Gnu
Evaluates a boolean function for each data item of the RDD and puts the items for which the function returned true into the resulting RDD.
Listing Variants
def filter(f: T => Boolean): RDD[T]
val a = sc.parallelize(1 to 10, 3)
a.filter(_ % 2 == 0)
res3: Array[Int] = Array(2, 4, 6, 8, 10)
When you provide a filter function, it must be able to handle all data items contained in the RDD. Scala provides so-called partial functions to deal with mixed data-types. (Tip: Partial functions are very useful if you
have some data which may be bad and which you do not want to handle, but for the good data (matching data) you want to apply some kind of map function. The following article is good. It teaches you about partial functions in a very nice way and explains why case has
to be used for partial functions: article)
Examples for mixed data without partial functions
val b = sc.parallelize(1 to 8)
b.filter(_ < 4).collect
res15: Array[Int] = Array(1, 2, 3)
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.filter(_ < 4).collect
<console>:15: error: value < is not a member of Any
This fails because some components of a are not implicitly comparable against integers. Collect uses the isDefinedAt property of a function-object to determine whether the test-function is compatible with each data item.
Only data items that pass this test (=filter) are then mapped using the function-object.
Examples for mixed data with partial functions
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.collect({case a: Int => "is integer"
case b: String => "is string" }).collect
res17: Array[String] = Array(is string, is string, is integer, is string)
val myfunc: PartialFunction[Any, Any] = {
case a: Int => "is integer"
case b: String => "is string" }
res21: Boolean = true
res22: Boolean = true
res23: Boolean = false
Be careful! The above code works because it only checks the type itself! If you use operations on this type, you have to explicitly declare what type you want instead of Any. Otherwise the compiler does (apparently) not
know what bytecode it should produce:
val myfunc2: PartialFunction[Any, Any] = {case x if (x< 4) => "x"}
<console>:10: error: value < is not a member of Any
val myfunc2: PartialFunction[Int, Any] = {case x if (x< 4) => "x"}
myfunc2: PartialFunction[Int,Any] = <function1>
是filterwith的扩展版本,第一个参数是Int->T的形式,其中 Int代表的是partition的索引,T代表你要转换成的类型,第二个参数是(U,T)->boolean的形式,T是partition的索引,U是值
def filterWith[A: ClassTag](constructA: Int =>A)(p: (T, A) => Boolean): RDD[T]
val a = sc.parallelize(1 to 9, 3)
val b = a.filterWith(i => i)((x,i) => x % 2 == 0|| i % 2 == 0)
res37: Array[Int] = Array(1, 2, 3, 4, 6, 7, 8, 9)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10), 5)
a.filterWith(x=> x)((a, b) => b == 0).collect
res30: Array[Int] = Array(1, 2)
a.filterWith(x=> x)((a, b) => a % (b+1) == 0).collect
res33: Array[Int] = Array(1, 2, 4, 6, 8, 10)
a.filterWith(x=> x.toString)((a, b) => b == "2").collect
res34: Array[Int] = Array(5, 6)
Our research group has a very strong focus on using and improving Apache Spark to solve real-world problems. In order to do this we need to have a very solid understanding of the capabilities of Spark. So one of the first
things we have done is to go through the entire Spark RDD API and write examples to test their functionality. This has been a very useful exercise and we would like to share the examples with everyone.
Authors of examples: Matthias Langer and Zhen He
Email addresses: m.langer@latrobe.edu.au, z.he@latrobe.edu.au
These examples have only been tested for Spark version 0.9. We assume the functionality of Spark is stable and therefore the examples should be valid for later releases.
Here is a PDF of all the examples: SparkExamples
The RDD API By Example
RDD is short for Resilient Distributed Dataset. RDDs are the workhorse of the Spark system. As a user, one can consider an RDD as a handle for a collection of individual data partitions, which are the result of some computation.
However, an RDD is actually more than that. On cluster installations, separate data partitions can be on separate nodes. Using the RDD as a handle one can access all partitions and perform computations and transformations
using the contained data. Whenever a part of an RDD or an entire RDD is lost, the system is able to reconstruct the data of lost partitions by using lineage information. Lineage refers to the sequence of transformations used to produce the current RDD. As a result,
Spark is able to recover automatically from most failures.
All RDDs available in Spark derive either directly or indirectly from the class RDD. This class comes with a large set of methods that perform operations on the data within the associated partitions. The class RDD is
abstract. Whenever one uses an RDD, one is actually using a concretized implementation of RDD. These implementations have to override some core functions to make the RDD behave as expected.
One reason why Spark has lately become a very popular system for processing big data is that it does not impose restrictions regarding what data can be stored within RDD partitions. The RDD API already contains many useful
operations. But, because the creators of Spark had to keep the core API of RDDs common enough to handle arbitrary data-types, many convenience functions are missing.
The basic RDD API considers each data item as a single value. However, users often want to work with key-value pairs. Therefore Spark extended the interface of RDD to provide additional functions (PairRDDFunctions), which
explicitly work on key-value pairs. Currently, there are four extensions to the RDD API available in Spark. They are as follows:
This extension contains many useful methods for aggregating numeric values. They become available if the data items of an RDD are implicitly convertible to the Scala data-type Double.
Methods defined in this interface extension become available when the data items have a two-component tuple structure. Spark will interpret the first tuple item (i.e. tuplename._1) as the key and the second item (i.e.
tuplename._2) as the associated value.
Methods defined in this interface extension become available if the data items are two-component tuples where the key is implicitly sortable.
This extension contains several methods that allow users to create Hadoop sequence files from RDDs. The data items must be two-component key-value tuples as required by the PairRDDFunctions. However, there are additional
requirements considering the convertibility of the tuple components to Writable types.
Since Spark will make methods with extended functionality automatically available to users when the data items fulfill the above described requirements, we decided to list all possible available functions in strictly
alphabetical order. We will append either of the following to the function-name to indicate it belongs to an extension that requires the data items to conform to a certain format or type.
[Double] - Double RDD Functions
[Ordered] - OrderedRDDFunctions
[Pair] - PairRDDFunctions
[SeqFile] - SequenceFileRDDFunctions
The aggregate-method provides an interface for performing highly customized reductions and aggregations with an RDD. However, due to the way Scala and Spark execute and process data, care must be taken to achieve deterministic
behavior. The following list contains a few observations we made while experimenting with aggregate:
The reduce and combine functions have to be commutative and associative.
As can be seen from the function definition below, the output of the combiner must be equal to its input. This is necessary because Spark will chain-execute it.
The zero value is the initial value of the U component when either seqOp or combOp are executed for the first element of their domain of influence. Depending on what you want to achieve, you may have to change it.
However, to make your code deterministic, make sure that your code will yield the same result regardless of the number or size of partitions.
Do not assume any execution order for either partition computations or combining partitions.
The neutral zeroValue is applied at the beginning of each sequence of reduces within the individual partitions and again when the output of separate partitions is combined.
Why have two separate combine functions? The first function maps the input values into the result space. Note that the aggregation data type (1st input and output) can be different (U != T). The second function reduces
these mapped values in the result space.
Why would one want to use two input data types? Let us assume we do an archaeological site survey using a metal detector. While walking through the site we take GPS coordinates of important findings based on the output
of the metal detector. Later, we intend to draw an image of a map that highlights these locations using the aggregate function. In this case the zeroValue could be an area map with no highlights. The possibly huge set of input data is stored as GPS coordinates
across many partitions. seqOp could convert the GPS coordinates to map coordinates and put a marker on the map at the respective position. combOp will receive these highlights as partial maps and combine them into a single final output map.
Listing Variants
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T)=> U, combOp: (U, U) => U): U
Examples 1
val z = sc.parallelize(List(1,2,3,4,5,6), 2)
z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
val z =sc.parallelize(List("a","b","c","d","e","f"),2)
z.aggregate("")(_ + _, _+_)
res115: String = abcdef
z.aggregate("x")(_ + _, _+_)
res116: String = xxdefxabc
val z =sc.parallelize(List("12","23","345","4567"),2)
z.aggregate("")((x,y) =>math.max(x.length, y.length).toString, (x,y) => x + y)
res141: String = 42
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res142: String = 11
val z =sc.parallelize(List("12","23","345",""),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res143: String = 10
The main issue with the code above is that the result of the inner min is a string of length 1.
The zero in the output is due to the empty string being the last string in the list. We see this result because we are not recursively reducing any further within the partition for the final string.
Examples 2
val z =sc.parallelize(List("12","23","","345"),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res144: String = 11
In contrast to the previous example, this example has the empty string at the beginning of the second partition. This results in a length of zero being input to the second reduce, which then upgrades it to a length of 1. (Warning:
The above example shows bad design since the output is dependent on the order of the data inside the partitions.)
Computes the cartesian product between two RDDs (i.e. each item of the first RDD is joined with each item of the second RDD) and returns them as a new RDD. (Warning: Be careful when using this function! Memory consumption
can quickly become an issue!)
Listing Variants
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]
val x = sc.parallelize(List(1,2,3,4,5))
val y = sc.parallelize(List(6,7,8,9,10))
res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8),(1,9), (1,10), (2,6), (2,7), (2,8), (2,9), (2,10), (3,6), (3,7), (3,8), (3,9),(3,10), (4,6), (5,6), (4,7), (5,7), (4,8), (5,8), (4,9), (4,10), (5,9), (5,10))
Will create a checkpoint when the RDD is computed next. Checkpointed RDDs are stored as a binary file within the checkpoint directory which can be specified using the Spark context. (Warning: Spark applies lazy evaluation.
Checkpointing will not occur until an action is invoked.)
Important note: the directory "my_directory_name" should exist on all slaves. As an alternative you could use an HDFS directory URL as well.
Listing Variants
def checkpoint()
val a = sc.parallelize(1 to 4)
14/02/25 18:13:53 INFO SparkContext: Starting job:count at <console>:15
14/02/25 18:13:53 INFO MemoryStore: Block broadcast_5stored as values to memory (estimated size 115.7 KB, free 296.3 MB)
14/02/25 18:13:53 INFO RDDCheckpointData: Donecheckpointing RDD 11 tofile:/home/cloudera/Documents/spark-0.9.0-incubating-bin-cdh4/bin/my_directory_name/65407913-fdc6-4ec1-82c9-48a1656b95d6/rdd-11,new parent is RDD
res23: Long = 4
coalesce, repartition
Coalesces the associated data into a given number of partitions. repartition(numPartitions) is simply an abbreviation for coalesce(numPartitions, shuffle = true).
Listing Variants
def coalesce ( numPartitions : Int , shuffle : Boolean= false ): RDD [T]
def repartition ( numPartitions : Int ): RDD [T]
val y = sc.parallelize(1 to 10, 10)
val z = y.coalesce(2, false)
res9: Int = 2
cogroup [Pair], groupWith [Pair]
A very powerful set of functions that allow grouping up to 3 key-value RDDs together using their keys.
Listing Variants
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions:Int): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def groupWith[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def groupWith[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.map((_, "b"))
val c = a.map((_, "c"))
res7: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
val d = a.map((_, "d"))
b.cogroup(c, d).collect
res9: Array[(Int, (Seq[String], Seq[String],Seq[String]))] = Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d,d)))
val x = sc.parallelize(List((1, "apple"),(2, "banana"), (3, "orange"), (4, "kiwi")), 2)
val y = sc.parallelize(List((5, "computer"),(1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
res23: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
collect, toArray
Converts the RDD into a Scala array and returns it. If you provide a standard map-function (i.e. f = T -> U) it will be applied before inserting the values into the result array.
Listing Variants
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]):RDD[U]
def toArray(): Array[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu,Rat)
collectAsMap [Pair]
Similar to collect, but works on key-value RDDs and converts them into Scala maps to preserve their key-value structure.
Listing Variants
def collectAsMap(): Map[K, V]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.zip(a)
res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1-> 1, 3 -> 3)
Very efficient implementation that combines the values of an RDD consisting of two-component tuples by applying multiple aggregators one after another.
Listing Variants
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions:Int): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner:Partitioner, mapSideCombine: Boolean = true, serializerClass: String = null):RDD[(K, C)]
val a =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
val c = b.zip(a)
val d = c.combineByKey(List(_), (x:List[String],y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
res16: Array[(Int, List[String])] = Array((1,List(cat,dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
Executes dependencies and computes the actual representation of the RDD. This function should not be called directly by users.
Listing Variants
def compute(split: Partition, context: TaskContext):Iterator[T]
context, sparkContext
Returns the SparkContext that was used to create the RDD.
Listing Variants
def context: SparkContext
def sparkContext: SparkContext
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res8: org.apache.spark.SparkContext =org.apache.spark.SparkContext@58c1c2f1
Returns the number of items stored within a RDD.
Listing Variants
def count(): Long
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res2: Long = 4
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble]
countByKey [Pair]
Very similar to count, but counts the values of an RDD consisting of two-component tuples for each distinct key separately.
Listing Variants
def countByKey(): Map[K, Long]
val c = sc.parallelize(List((3, "Gnu"), (3,"Yak"), (5, "Mouse"), (3, "Dog")), 2)
res3: scala.collection.Map[Int,Long] = Map(3 -> 3,5 -> 1)
countByKeyApprox [Pair]
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByKeyApprox(timeout: Long, confidence: Double= 0.95): PartialResult[Map[K, BoundedDouble]]
Returns a map that contains all unique values of the RDD and their respective occurrence counts. (Warning: This operation will finally aggregate the information in a single reducer.)
Listing Variants
def countByValue(): Map[T, Long]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
res27: scala.collection.Map[Int,Long] = Map(5 -> 1,8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByValueApprox(timeout: Long, confidence:Double = 0.95): PartialResult[Map[T, BoundedDouble]]
Computes the approximate number of distinct values. For large RDDs which are spread across many nodes, this function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinct(relativeSD: Double = 0.05):Long
val a = sc.parallelize(1 to 10000, 20)
val b = a++a++a++a++a
res14: Long = 10784
res15: Long = 11055
res16: Long = 10040
res0: Long = 10001
countApproxDistinctByKey [Pair]
Similar to countApproxDistinct, but computes the approximate number of distinct values for each distinct key. Hence, the RDD must consist of two-component tuples. For large RDDs which are spread across many nodes, this
function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinctByKey(relativeSD: Double = 0.05):RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,numPartitions: Int): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,partitioner: Partitioner): RDD[(K, Long)]
val a = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
val b = sc.parallelize(a.takeSample(true, 10000, 0),20)
val c = sc.parallelize(1 to b.count().toInt, 20)
val d = b.zip(c)
res15: Array[(String, Long)] = Array((Rat,2567),(Cat,3357), (Dog,2414), (Gnu,2494))
res16: Array[(String, Long)] = Array((Rat,2555),(Cat,2455), (Dog,2425), (Gnu,2513))
res0: Array[(String, Long)] = Array((Rat,2562),(Cat,2464), (Dog,2451), (Gnu,2521))
Returns the RDD on which this RDD depends.
Listing Variants
final def dependencies: Seq[Dependency[_]]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
b: org.apache.spark.rdd.RDD[Int] =ParallelCollectionRDD[32] at parallelize at <console>:12
Int = 0
b.map(a => a).dependencies.length
res40: Int = 1
res41: Int = 2
res42: Seq[org.apache.spark.Dependency[_]] =List(org.apache.spark.rdd.CartesianRDD$$anon$1@576ddaaa,org.apache.spark.rdd.CartesianRDD$$anon$2@6d2efbbd)
Returns a new RDD that contains each unique value only once.
Listing Variants
def distinct(): RDD[T]
def distinct(numPartitions: Int): RDD[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res6: Array[String] = Array(Dog, Gnu, Cat, Rat)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10))
res16: Int = 2
res17: Int = 3
Looks for the very first data item of the RDD and returns it.
Listing Variants
def first(): T
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res1: String = Gnu
Evaluates a boolean function for each data item of the RDD and puts the items for which the function returned true into the resulting RDD.
Listing Variants
def filter(f: T => Boolean): RDD[T]
val a = sc.parallelize(1 to 10, 3)
a.filter(_ % 2 == 0)
res3: Array[Int] = Array(2, 4, 6, 8, 10)
When you provide a filter function, it must be able to handle all data items contained in the RDD. Scala provides so-called partial functions to deal with mixed data-types. (Tip: Partial functions are very useful if you
have some data which may be bad and which you do not want to handle, while for the good data (matching data) you want to apply some kind of map function. The following article is good. It teaches you about partial functions in a very nice way and explains why case has
to be used for partial functions: article)
Examples for mixed data without partial functions
val b = sc.parallelize(1 to 8)
b.filter(_ < 4).collect
res15: Array[Int] = Array(1, 2, 3)
val a = sc.parallelize(List("cat", "horse",4.0, 3.5, 2, "dog"))
a.filter(_ < 4).collect
<console>:15: error: value < is not a memberof Any
This fails because some components of a are not implicitly comparable against integers. Collect uses the isDefinedAt property of a function-object to determine whether the test-function is compatible with each data item.
Only data items that pass this test (=filter) are then mapped using the function-object.
Examples for mixed data with partial functions
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.collect({case a: Int => "is integer" |
case b: String => "is string" }).collect
res17: Array[String] = Array(is string, is string, is integer, is string)
val myfunc: PartialFunction[Any, Any] = {
case a:Int => "is integer" |
case b: String=> "is string" }
res21: Boolean = true
res22: Boolean = true
res23: Boolean = false
Be careful! The above code works because it only checks the type itself! If you use operations on this type, you have to explicitly declare what type you want instead of Any. Otherwise the compiler does (apparently) not
know what bytecode it should produce:
val myfunc2: PartialFunction[Any, Any] = {case x if (x< 4) => "x"}
<console>:10: error: value < is not a memberof Any
val myfunc2: PartialFunction[Int, Any] = {case x if (x< 4) => "x"}
myfunc2: PartialFunction[Int,Any] = <function1>
This is an extended version of filter. It takes two function arguments. The first argument must conform to Int -> T and is executed once per partition. It will transform the partition index to type T. The second function
looks like (U, T) -> Boolean. T is the transformed partition index and U are the data items from the RDD. Finally the function has to return either true or false (i.e. apply the filter).
Listing Variants
def filterWith[A: ClassTag](constructA: Int =>A)(p: (T, A) => Boolean): RDD[T]
val a = sc.parallelize(1 to 9, 3)
val b = a.filterWith(i => i)((x,i) => x % 2 == 0|| i % 2 == 0)
res37: Array[Int] = Array(1, 2, 3, 4, 6, 7, 8, 9)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10), 5)
a.filterWith(x=> x)((a, b) => b == 0).collect
res30: Array[Int] = Array(1, 2)
a.filterWith(x=> x)((a, b) => a % (b+1) == 0).collect
res33: Array[Int] = Array(1, 2, 4, 6, 8, 10)
a.filterWith(x=> x.toString)((a, b) => b == "2").collect
res34: Array[Int] = Array(5, 6)
def flatMap[U: ClassTag](f: T => TraversableOnce[U]):RDD[U]
val a = sc.parallelize(1 to 10, 5)
a.flatMap(1 to _).collect
res47: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3,4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7,8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
sc.parallelize(List(1, 2, 3), 2).flatMap(x =>List(x, x, x)).collect
res85: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)
// The program below generates a random number of copies (up to 10) of the items in the list.
val x =sc.parallelize(1 to 10, 3)
res1: Array[Int] = Array(1, 2, 3, 3, 3, 4, 4, 4, 4, 4,4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9,9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10)
def flatMapValues[U](f: V => TraversableOnce[U]):RDD[(K, U)]
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
b.flatMapValues("x" + _ +"x").collect
res6: Array[(Int, Char)] = Array((3,x), (3,d), (3,o),(3,g), (3,x), (5,x), (5,t), (5,i), (5,g), (5,e), (5,r), (5,x), (4,x), (4,l),(4,i), (4,o), (4,n), (4,x), (3,x), (3,c), (3,a), (3,t), (3,x), (7,x), (7,p),(7,a), (7,n),
(7,t), (7,h), (7,e), (7,r), (7,x), (5,x), (5,e), (5,a), (5,g),(5,l), (5,e), (5,x))
Our research group has a very strong focus on using and improving Apache Spark to solve real-world problems. In order to do this we need to have a very solid understanding of the capabilities of Spark. So one of the first
things we have done is to go through the entire Spark RDD API and write examples to test their functionality. This has been a very useful exercise and we would like to share the examples with everyone.
Authors of examples: Matthias Langer and Zhen He
Email addresses: m.langer@latrobe.edu.au, z.he@latrobe.edu.au
These examples have only been tested for Spark version 0.9. We assume the functionality of Spark is stable and therefore the examples should be valid for later releases.
Here is a pdf of all the examples: SparkExamples
The RDD API By Example
RDD is short for Resilient Distributed Dataset. RDDs are the workhorse of the Spark system. As a user, one can consider an RDD as a handle for a collection of individual data partitions, which are the result of some computation.
However, an RDD is actually more than that. On cluster installations, separate data partitions can be on separate nodes. Using the RDD as a handle one can access all partitions and perform computations and transformations
using the contained data. Whenever a part of an RDD or an entire RDD is lost, the system is able to reconstruct the data of lost partitions by using lineage information. Lineage refers to the sequence of transformations used to produce the current RDD. As a result,
Spark is able to recover automatically from most failures.
All RDDs available in Spark derive either directly or indirectly from the class RDD. This class comes with a large set of methods that perform operations on the data within the associated partitions. The class RDD is
abstract. Whenever one uses an RDD, one is actually using a concrete implementation of RDD. These implementations have to override some core functions to make the RDD behave as expected.
One reason why Spark has lately become a very popular system for processing big data is that it does not impose restrictions regarding what data can be stored within RDD partitions. The RDD API already contains many useful
operations. But, because the creators of Spark had to keep the core API of RDDs common enough to handle arbitrary data-types, many convenience functions are missing.
The basic RDD API considers each data item as a single value. However, users often want to work with key-value pairs. Therefore Spark extended the interface of RDD to provide additional functions (PairRDDFunctions), which
explicitly work on key-value pairs. Currently, there are four extensions to the RDD API available in Spark. They are as follows:
This extension contains many useful methods for aggregating numeric values. They become available if the data items of an RDD are implicitly convertible to the Scala data-type Double.
Methods defined in this interface extension become available when the data items have a two-component tuple structure. Spark will interpret the first tuple item (i.e. tuplename._1) as the key and the second item (i.e.
tuplename._2) as the associated value.
Methods defined in this interface extension become available if the data items are two-component tuples where the key is implicitly sortable.
This extension contains several methods that allow users to create Hadoop sequence-files from RDDs. The data items must be two-component key-value tuples as required by the PairRDDFunctions. However, there are additional
requirements considering the convertibility of the tuple components to Writable types.
Since Spark will make methods with extended functionality automatically available to users when the data items fulfill the above described requirements, we decided to list all possible available functions in strictly
alphabetical order. We will append one of the following to the function name to indicate it belongs to an extension that requires the data items to conform to a certain format or type.
[Double] - Double RDD Functions
[Ordered] - OrderedRDDFunctions
[Pair] - PairRDDFunctions
[SeqFile] - SequenceFileRDDFunctions
The aggregate-method provides an interface for performing highly customized reductions and aggregations with an RDD. However, due to the way Scala and Spark execute and process data, care must be taken to achieve deterministic
behavior. The following list contains a few observations we made while experimenting with aggregate:
The reduce and combine functions have to be commutative and associative.
As can be seen from the function definition below, the output type of the combiner must be equal to its input type. This is necessary because Spark will chain-execute it.
The zero value is the initial value of the U component when either seqOp or combOp is executed for the first element of their domain of influence. Depending on what you want to achieve, you may have to change it.
However, to make your code deterministic, make sure that your code will yield the same result regardless of the number or size of partitions.
Do not assume any execution order for either partition computations or combining partitions.
The neutral zeroValue is applied at the beginning of each sequence of reduces within the individual partitions and again when the output of separate partitions is combined.
Why have two separate combine functions? The first function maps the input values into the result space. Note that the aggregation data type (1st input and output) can be different (U != T). The second function reduces
these mapped values in the result space.
Why would one want to use two input data types? Let us assume we do an archaeological site survey using a metal detector. While walking through the site we take GPS coordinates of important findings based on the output
of the metal detector. Later, we intend to draw an image of a map that highlights these locations using the aggregate function. In this case the zeroValue could be an area map with no highlights. The possibly huge set of input data is stored as GPS coordinates
across many partitions. seqOp could convert the GPS coordinates to map coordinates and put a marker on the map at the respective position. combOp will receive these highlights as partial maps and combine them into a single final output map.
Listing Variants
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T)=> U, combOp: (U, U) => U): U
Examples 1
val z = sc.parallelize(List(1,2,3,4,5,6), 2)
z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
val z =sc.parallelize(List("a","b","c","d","e","f"),2)
z.aggregate("")(_ + _, _+_)
res115: String = abcdef
z.aggregate("x")(_ + _, _+_)
res116: String = xxdefxabc
val z = sc.parallelize(List("12","23","345","4567"),2)
z.aggregate("")((x,y) =>math.max(x.length, y.length).toString, (x,y) => x + y)
res141: String = 42
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res142: String = 11
val z = sc.parallelize(List("12","23","345",""),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res143: String = 10
The main issue with the code above is that the result of the inner min is a string of length 1.
The zero in the output is due to the empty string being the last string in the list. We see this result because we are not recursively reducing any further within the partition for the final string.
Examples 2
val z =sc.parallelize(List("12","23","","345"),2)
z.aggregate("")((x,y) =>math.min(x.length, y.length).toString, (x,y) => x + y)
res144: String = 11
In contrast to the previous example, this example has the empty string at the beginning of the second partition. This results in a length of zero being input to the second reduce, which then upgrades it to a length of 1. (Warning:
The above example shows bad design since the output is dependent on the order of the data inside the partitions.)
Computes the cartesian product between two RDDs (i.e. each item of the first RDD is joined with each item of the second RDD) and returns them as a new RDD. (Warning: Be careful when using this function! Memory consumption
can quickly become an issue!)
Listing Variants
def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]
val x = sc.parallelize(List(1,2,3,4,5))
val y = sc.parallelize(List(6,7,8,9,10))
res0: Array[(Int, Int)] = Array((1,6), (1,7), (1,8),(1,9), (1,10), (2,6), (2,7), (2,8), (2,9), (2,10), (3,6), (3,7), (3,8), (3,9),(3,10), (4,6), (5,6), (4,7), (5,7), (4,8), (5,8), (4,9), (4,10), (5,9), (5,10))
Will create a checkpoint when the RDD is computed next. Checkpointed RDDs are stored as a binary file within the checkpoint directory which can be specified using the Spark context. (Warning: Spark applies lazy evaluation.
Checkpointing will not occur until an action is invoked.)
Important note: the directory "my_directory_name" should exist on all slaves. As an alternative you could use an HDFS directory URL as well.
Listing Variants
def checkpoint()
val a = sc.parallelize(1 to 4)
14/02/25 18:13:53 INFO SparkContext: Starting job:count at <console>:15
14/02/25 18:13:53 INFO MemoryStore: Block broadcast_5stored as values to memory (estimated size 115.7 KB, free 296.3 MB)
14/02/25 18:13:53 INFO RDDCheckpointData: Donecheckpointing RDD 11 tofile:/home/cloudera/Documents/spark-0.9.0-incubating-bin-cdh4/bin/my_directory_name/65407913-fdc6-4ec1-82c9-48a1656b95d6/rdd-11,new parent is RDD
res23: Long = 4
coalesce, repartition
Coalesces the associated data into a given number of partitions. repartition(numPartitions) is simply an abbreviation for coalesce(numPartitions, shuffle = true).
Listing Variants
def coalesce ( numPartitions : Int , shuffle : Boolean= false ): RDD [T]
def repartition ( numPartitions : Int ): RDD [T]
val y = sc.parallelize(1 to 10, 10)
val z = y.coalesce(2, false)
res9: Int = 2
cogroup [Pair], groupWith [Pair]
A very powerful set of functions that allow grouping up to 3 key-value RDDs together using their keys.
Listing Variants
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def cogroup[W](other: RDD[(K, W)], numPartitions:Int): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (Seq[V], Seq[W]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def cogroup[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)], partitioner: Partitioner): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
def groupWith[W](other: RDD[(K, W)]): RDD[(K, (Seq[V],Seq[W]))]
def groupWith[W1, W2](other1: RDD[(K, W1)], other2:RDD[(K, W2)]): RDD[(K, (Seq[V], Seq[W1], Seq[W2]))]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.map((_, "b"))
val c = a.map((_, "c"))
res7: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
val d = a.map((_, "d"))
b.cogroup(c, d).collect
res9: Array[(Int, (Seq[String], Seq[String],Seq[String]))] = Array(
(1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d,d)))
val x = sc.parallelize(List((1, "apple"),(2, "banana"), (3, "orange"), (4, "kiwi")), 2)
val y = sc.parallelize(List((5, "computer"),(1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
res23: Array[(Int, (Seq[String], Seq[String]))] =Array(
(1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
collect, toArray
Converts the RDD into a Scala array and returns it. If you provide a standard map-function (i.e. f = T -> U) it will be applied before inserting the values into the result array.
Listing Variants
def collect(): Array[T]
def collect[U: ClassTag](f: PartialFunction[T, U]):RDD[U]
def toArray(): Array[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res29: Array[String] = Array(Gnu, Cat, Rat, Dog, Gnu,Rat)
collectAsMap [Pair]
Similar to collect, but works on key-value RDDs and converts them into Scala maps to preserve their key-value structure.
Listing Variants
def collectAsMap(): Map[K, V]
val a = sc.parallelize(List(1, 2, 1, 3), 1)
val b = a.zip(a)
res1: scala.collection.Map[Int,Int] = Map(2 -> 2, 1-> 1, 3 -> 3)
Very efficient implementation that combines the values of an RDD consisting of two-component tuples by applying multiple aggregators one after another.
Listing Variants
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions:Int): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C,mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner:Partitioner, mapSideCombine: Boolean = true, serializerClass: String = null):RDD[(K, C)]
val a =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
val c = b.zip(a)
val d = c.combineByKey(List(_), (x:List[String],y:String) => y :: x, (x:List[String], y:List[String]) => x ::: y)
res16: Array[(Int, List[String])] = Array((1,List(cat,dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
Executes dependencies and computes the actual representation of the RDD. This function should not be called directly by users.
Listing Variants
def compute(split: Partition, context: TaskContext):Iterator[T]
context, sparkContext
Returns the SparkContext that was used to create the RDD.
Listing Variants
def context: SparkContext
def sparkContext: SparkContext
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res8: org.apache.spark.SparkContext =org.apache.spark.SparkContext@58c1c2f1
Returns the number of items stored within a RDD.
Listing Variants
def count(): Long
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res2: Long = 4
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble]
countByKey [Pair]
Very similar to count, but counts the values of an RDD consisting of two-component tuples for each distinct key separately.
Listing Variants
def countByKey(): Map[K, Long]
val c = sc.parallelize(List((3, "Gnu"), (3,"Yak"), (5, "Mouse"), (3, "Dog")), 2)
res3: scala.collection.Map[Int,Long] = Map(3 -> 3,5 -> 1)
countByKeyApprox [Pair]
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByKeyApprox(timeout: Long, confidence: Double= 0.95): PartialResult[Map[K, BoundedDouble]]
Returns a map that contains all unique values of the RDD and their respective occurrence counts. (Warning: This operation will finally aggregate the information in a single reducer.)
Listing Variants
def countByValue(): Map[T, Long]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
res27: scala.collection.Map[Int,Long] = Map(5 -> 1,8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
Marked as experimental feature! Experimental features are currently not covered by this document!
Listing Variants
def countByValueApprox(timeout: Long, confidence:Double = 0.95): PartialResult[Map[T, BoundedDouble]]
Computes the approximate number of distinct values. For large RDDs which are spread across many nodes, this function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinct(relativeSD: Double = 0.05):Long
val a = sc.parallelize(1 to 10000, 20)
val b = a++a++a++a++a
res14: Long = 10784
res15: Long = 11055
res16: Long = 10040
res0: Long = 10001
countApproxDistinctByKey [Pair]
Similar to countApproxDistinct, but computes the approximate number of distinct values for each distinct key. Hence, the RDD must consist of two-component tuples. For large RDDs which are spread across many nodes, this
function may execute faster than other counting methods. The parameter relativeSD controls the accuracy of the computation.
Listing Variants
def countApproxDistinctByKey(relativeSD: Double =0.05): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,numPartitions: Int): RDD[(K, Long)]
def countApproxDistinctByKey(relativeSD: Double,partitioner: Partitioner): RDD[(K, Long)]
val a = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
val b = sc.parallelize(a.takeSample(true, 10000, 0),20)
val c = sc.parallelize(1 to b.count().toInt, 20)
val d = b.zip(c)
res15: Array[(String, Long)] = Array((Rat,2567),(Cat,3357), (Dog,2414), (Gnu,2494))
res16: Array[(String, Long)] = Array((Rat,2555),(Cat,2455), (Dog,2425), (Gnu,2513))
res0: Array[(String, Long)] = Array((Rat,2562),(Cat,2464), (Dog,2451), (Gnu,2521))
Returns the RDD on which this RDD depends.
Listing Variants
final def dependencies: Seq[Dependency[_]]
val b =sc.parallelize(List(1,2,3,4,5,6,7,8,2,4,2,1,1,1,1,1))
b: org.apache.spark.rdd.RDD[Int] =ParallelCollectionRDD[32] at parallelize at <console>:12
Int = 0
b.map(a => a).dependencies.length
res40: Int = 1
res41: Int = 2
res42: Seq[org.apache.spark.Dependency[_]] =List(org.apache.spark.rdd.CartesianRDD$$anon$1@576ddaaa,org.apache.spark.rdd.CartesianRDD$$anon$2@6d2efbbd)
Returns a new RDD that contains each unique value only once.
Listing Variants
def distinct(): RDD[T]
def distinct(numPartitions: Int): RDD[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
res6: Array[String] = Array(Dog, Gnu, Cat, Rat)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10))
res16: Int = 2
res17: Int = 3
Looks for the very first data item of the RDD and returns it.
Listing Variants
def first(): T
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog"), 2)
res1: String = Gnu
Evaluates a boolean function for each data item of the RDD and puts the items for which the function returned true into the resulting RDD.
Listing Variants
def filter(f: T => Boolean): RDD[T]
val a = sc.parallelize(1 to 10, 3)
a.filter(_ % 2 == 0)
res3: Array[Int] = Array(2, 4, 6, 8, 10)
When you provide a filter function, it must be able to handle all data items contained in the RDD. Scala provides so-called partial functions to deal with mixed data types. (Tip: Partial functions are very useful if you
have some data which may be bad and which you do not want to handle, but for the good data (matching data) you want to apply some kind of map function. The following article is good. It teaches you about partial functions in a very nice way and explains why case has
to be used for partial functions: article)
Examples for mixed data without partial functions
val b = sc.parallelize(1 to 8)
b.filter(_ < 4).collect
res15: Array[Int] = Array(1, 2, 3)
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.filter(_ < 4).collect
<console>:15: error: value < is not a memberof Any
This fails because some components of a are not implicitly comparable against integers. Collect uses the isDefinedAt property of a function-object to determine whether the test-function is compatible with each data item.
Only data items that pass this test (=filter) are then mapped using the function-object.
Examples for mixed data with partial functions
val a = sc.parallelize(List("cat","horse", 4.0, 3.5, 2, "dog"))
a.collect({case a: Int => "is integer" |
case b: String => "is string" }).collect
res17: Array[String] = Array(is string, is string, is integer, is string)
val myfunc: PartialFunction[Any, Any] = {
case a:Int => "is integer" |
case b: String=> "is string" }
res21: Boolean = true
res22: Boolean = true
res23: Boolean = false
Be careful! The above code works because it only checks the type itself! If you use operations on this type, you have to explicitly declare what type you want instead of Any. Otherwise the compiler does (apparently) not
know what bytecode it should produce:
val myfunc2: PartialFunction[Any, Any] = {case x if (x< 4) => "x"}
<console>:10: error: value < is not a memberof Any
val myfunc2: PartialFunction[Int, Any] = {case x if (x< 4) => "x"}
myfunc2: PartialFunction[Int,Any] = <function1>
This is an extended version of filter. It takes two function arguments. The first argument must conform to Int -> T and is executed once per partition. It will transform the partition index to type T. The second function
looks like (U, T) -> Boolean. T is the transformed partition index and U are the data items from the RDD. Finally the function has to return either true or false (i.e. apply the filter).
Listing Variants
def filterWith[A: ClassTag](constructA: Int =>A)(p: (T, A) => Boolean): RDD[T]
val a = sc.parallelize(1 to 9, 3)
val b = a.filterWith(i => i)((x,i) => x % 2 == 0|| i % 2 == 0)
res37: Array[Int] = Array(1, 2, 3, 4, 6, 7, 8, 9)
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10), 5)
a.filterWith(x=> x)((a, b) => b == 0).collect
res30: Array[Int] = Array(1, 2)
a.filterWith(x=> x)((a, b) => a % (b+1) == 0).collect
res33: Array[Int] = Array(1, 2, 4, 6, 8, 10)
a.filterWith(x=> x.toString)((a, b) => b == "2").collect
res34: Array[Int] = Array(5, 6)
Similar to map, but allows emitting more than one item in the map function.
Listing Variants
def flatMap[U: ClassTag](f: T =>TraversableOnce[U]): RDD[U]
val a = sc.parallelize(1 to 10, 5)
a.flatMap(1 to _).collect
res47: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3,4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7,8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
sc.parallelize(List(1, 2, 3), 2).flatMap(x =>List(x, x, x)).collect
res85: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)
// The program below generates a random number of copies (up to 10) of the items in the list.
val x =sc.parallelize(1 to 10, 3)
res1: Array[Int] = Array(1, 2, 3, 3, 3, 4, 4, 4, 4, 4,4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9,9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10)
Very similar to mapValues, but collapses the inherent structure of the values during mapping.
Listing Variants
def flatMapValues[U](f: V => TraversableOnce[U]):RDD[(K, U)]
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
b.flatMapValues("x" + _ +"x").collect
res6: Array[(Int, Char)] = Array((3,x), (3,d), (3,o),(3,g), (3,x), (5,x), (5,t), (5,i), (5,g), (5,e), (5,r), (5,x), (4,x), (4,l),(4,i), (4,o), (4,n), (4,x), (3,x), (3,c), (3,a), (3,t), (3,x), (7,x), (7,p),(7,a), (7,n),
(7,t), (7,h), (7,e), (7,r), (7,x), (5,x), (5,e), (5,a), (5,g),(5,l), (5,e), (5,x))
def flatMapWith[A: ClassTag, U: ClassTag](constructA:Int => A, preservesPartitioning: Boolean = false)(f: (T, A) => Seq[U]):RDD[U]
val a = sc.parallelize(List(1,2,3,4,5,6,7,8,9), 3)
a.flatMapWith(x => x, true)((x, y) => List(y,x)).collect
res58: Array[Int] = Array(0, 1, 0, 2, 0, 3, 1, 4, 1,5, 1, 6, 2, 7, 2, 8, 2, 9)
val a = sc.parallelize(List(1,2,3), 3)
a.fold(0)(_ + _)
res59: Int = 6
a.fold(1)(_ + _)
res59: Int = 10
a.fold(2)(_ + _)
res59: Int = 14
a.fold(1)(_ - _)
res59: Int = 4
a.fold(2)(_ - _)
res59: Int = 2
def foldByKey(zeroValue: V)(func: (V, V) => V):RDD[(K, V)]
def foldByKey(zeroValue: V, numPartitions: Int)(func:(V, V) => V): RDD[(K, V)]
def foldByKey(zeroValue: V, partitioner:Partitioner)(func: (V, V) => V): RDD[(K, V)]
val a = sc.parallelize(List("dog","cat", "owl", "gnu", "ant"), 2)
val b = a.map(x => (x.length, x))
b.foldByKey("")(_ + _).collect
res84: Array[(Int, String)] =Array((3,dogcatowlgnuant)
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
b.foldByKey("")(_ + _).collect
res85: Array[(Int, String)] = Array((4,lion),(3,dogcat), (7,panther), (5,tigereagle))
对每一个data item都执行这个方法
def foreach(f: T => Unit)
val c = sc.parallelize(List("cat","dog", "tiger", "lion", "gnu","crocodile", "ant", "whale", "dolphin","spider"), 3)
c.foreach(x => println(x + "s areyummy"))
lions are yummy
gnus are yummy
crocodiles are yummy
ants are yummy
whales are yummy
dolphins are yummy
spiders are yummy
def foreachPartition(f: Iterator[T] => Unit)
val b = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8,9), 3)
b.foreachPartition(x => println(x.reduce(_ + _)))
def foreachWith[A: ClassTag](constructA: Int => A)(f: (T, A) => Unit)
val a = sc.parallelize(1 to 9, 3)
a.foreachWith(i => i)((x,i) => if (x % 2 == 1&& i % 2 == 0) println(x) )
def getCheckpointFile: Option[String]
val a = sc.parallelize(1 to 500, 5)
val b = a++a++a++a++a
res49: Option[String] = None
res54: Option[String] = None
res57: Option[String] =Some(file:/home/cloudera/Documents/cb978ffb-a346-4820-b3ba-d56580787b20/rdd-40)
val a = sc.parallelize(1 to 100000, 2)
String = Disk Serialized 1x Replicated
java.lang.UnsupportedOperationException: Cannot changestorage level of an RDD after it was already assigned a level
def glom(): RDD[Array[T]]
val a = sc.parallelize(1 to 100, 3)
res8: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5, 6,7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,27, 28, 29, 30, 31, 32, 33), Array(34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,45, 46,
47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,65, 66), Array(67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100))
def groupBy[K: ClassTag](f: T => K): RDD[(K,Seq[T])]
def groupBy[K: ClassTag](f: T => K, numPartitions:Int): RDD[(K, Seq[T])]
def groupBy[K: ClassTag](f: T => K, p:Partitioner): RDD[(K, Seq[T])]
val a = sc.parallelize(1 to 9, 3)
a.groupBy(x => { if (x % 2 == 0) "even"else "odd" }).collect
res42: Array[(String, Seq[Int])] =Array((even,ArrayBuffer(2, 4, 6, 8)), (odd,ArrayBuffer(1, 3, 5, 7, 9)))
val a = sc.parallelize(1 to 9, 3)
def myfunc(a: Int) : Int =
a % 2
res3: Array[(Int, Seq[Int])] = Array((0,ArrayBuffer(2,4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))
val a = sc.parallelize(1 to 9, 3)
def myfunc(a: Int) : Int =
a % 2
a.groupBy(x => myfunc(x), 3).collect //此处的3是指定partition
a.groupBy(myfunc(_), 1).collect
res7: Array[(Int, Seq[Int])] = Array((0,ArrayBuffer(2,4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))
import org.apache.spark.Partitioner
class MyPartitioner extends Partitioner {
  def numPartitions: Int = 2
  def getPartition(key: Any): Int =
    key match {
      case null => 0
      case key: Int => key % numPartitions
      case _ => key.hashCode % numPartitions
    }
  override def equals(other: Any): Boolean =
    other match {
      case h: MyPartitioner => true
      case _ => false
    }
}
val a = sc.parallelize(1 to 9, 3)
val p = new MyPartitioner()
val b = a.groupBy((x:Int) => { x }, p)
val c = b.mapWith(i => i)((a, b) => (b, a))
res42: Array[(Int, (Int, Seq[Int]))] =Array((0,(4,ArrayBuffer(4))), (0,(2,ArrayBuffer(2))), (0,(6,ArrayBuffer(6))),(0,(8,ArrayBuffer(8))), (1,(9,ArrayBuffer(9))), (1,(3,ArrayBuffer(3))),(1,(1,ArrayBuffer(1))), (1,(7,ArrayBuffer(7))),
groupByKey [Pair]:
def groupByKey(): RDD[(K, Seq[V])]
def groupByKey(numPartitions: Int): RDD[(K, Seq[V])]
def groupByKey(partitioner: Partitioner): RDD[(K,Seq[V])]
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "spider","eagle"), 2)
val b = a.keyBy(_.length)
res11: Array[(Int, Seq[String])] = Array((4,ArrayBuffer(lion)),(6,ArrayBuffer(spider)), (3,ArrayBuffer(dog, cat)), (5,ArrayBuffer(tiger,eagle)))
histogram [Double]:
def histogram(bucketCount: Int): Pair[Array[Double],Array[Long]]
def histogram(buckets: Array[Double], evenBuckets:Boolean = false): Array[Long]
val a = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1,7.4, 7.5, 7.6, 8.8, 9.0), 3)
res11: (Array[Double], Array[Long]) = (Array(1.1,2.68, 4.26, 5.84, 7.42, 9.0),Array(5, 0, 0, 1, 4))
val a = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3,5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
res18: (Array[Double], Array[Long]) = (Array(1.0, 2.5,4.0, 5.5, 7.0, 8.5, 10.0),Array(6, 0, 1, 1, 3, 4))
val a = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1,7.4, 7.5, 7.6, 8.8, 9.0), 3)
a.histogram(Array(0.0, 3.0, 8.0))
res14: Array[Long] = Array(5, 3)
val a = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3,5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
a.histogram(Array(0.0, 5.0, 10.0))
res1: Array[Long] = Array(6, 9)
a.histogram(Array(0.0, 5.0, 10.0, 15.0))
res1: Array[Long] = Array(6, 8, 1)
val y = sc.parallelize(1 to 10, 10)
res16: Int = 19
res6: Boolean = false
res8: Boolean = false
res9: Boolean = true
join [Pair]:
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], numPartitions: Int):RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (V, W))]
val a = sc.parallelize(List("dog","salmon", "salmon", "rat", "elephant"),3)
val b = a.keyBy(_.length)
val c =
val d = c.keyBy(_.length)
res17: Array[(Int, (String, String))] =Array((6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)),(6,(rabbit,salmon)), (6,(rabbit,rabbit)), (6,(rabbit,turkey)),(6,(turkey,salmon)), (6,(turkey,rabbit)), (6,(turkey,turkey)),
(3,(dog,dog)),(3,(dog,cat)), (3,(dog,gnu)), (3,(dog,bee)), (3,(cat,dog)), (3,(cat,cat)), (3,(cat,gnu)),(3,(cat,bee)), (3,(gnu,dog)), (3,(gnu,cat)), (3,(gnu,gnu)), (3,(gnu,bee)),(3,(bee,dog)), (3,(bee,cat)), (3,(bee,gnu)), (3,(bee,bee)), (4,(wolf,wolf)),(4,(wolf,bear)),
(4,(bear,wolf)), (4,(bear,bear)))
def keyBy[K](f: T => K): RDD[(K, T)]
val a = sc.parallelize(List("dog","salmon", "salmon", "rat", "elephant"),3)
val b = a.keyBy(_.length)
res26: Array[(Int, String)] = Array((3,dog),(6,salmon), (6,salmon), (3,rat), (8,elephant))
keys [Pair]:
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
res2: Array[Int] = Array(3, 5, 4, 3, 7, 5)
leftOuterJoin [Pair]:
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V,Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)],numPartitions: Int): RDD[(K, (V, Option[W]))]
def leftOuterJoin[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (V, Option[W]))]
val a = sc.parallelize(List("dog","salmon", "salmon", "rat", "elephant"),3)
val b = a.keyBy(_.length)
val c =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val d = c.keyBy(_.length)
res1: Array[(Int, (String, Option[String]))] =Array((6,(salmon,Some(salmon))), (6,(salmon,Some(rabbit))),(6,(salmon,Some(turkey))), (6,(salmon,Some(salmon))),(6,(salmon,Some(rabbit))), (6,(salmon,Some(turkey))), (3,(dog,Some(dog))),(3,(dog,Some(cat))),
(3,(dog,Some(gnu))), (3,(dog,Some(bee))),(3,(rat,Some(dog))), (3,(rat,Some(cat))), (3,(rat,Some(gnu))),(3,(rat,Some(bee))), (8,(elephant,None)))
def lookup(key: K): Seq[V]
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
res0: Seq[String] = WrappedArray(tiger, eagle)
def map[U: ClassTag](f: T => U): RDD[U]
val a = sc.parallelize(List("dog","salmon", "salmon", "rat", "elephant"),3)
val b = a.map(_.length)
val c = a.zip(b)
res0: Array[(String, Int)] = Array((dog,3),(salmon,6), (salmon,6), (rat,3), (elephant,8))
def mapPartitions[U: ClassTag](f: Iterator[T] =>Iterator[U], preservesPartitioning: Boolean = false): RDD[U]
Example 1
val a = sc.parallelize(1 to 9, 3)
def myfunc[T](iter: Iterator[T]) : Iterator[(T, T)] ={
var res =List[(T, T)]()
var pre =iter.next
val cur =iter.next;
res .::=(pre, cur)
pre = cur;
res0: Array[(Int, Int)] = Array((2,3), (1,2), (5,6),(4,5), (8,9), (7,8))
Example 2
val x = sc.parallelize(List("1","2", "3", "4", "5", "6","7", "8", "10"), 3)
def myfunc(iter: Iterator[Int]) : Iterator[Int] = {
var res =List[Int]()
while(iter.hasNext) {
val cur =iter.next;
res = res::: List.fill(scala.util.Random.nextInt(10))(cur)
// Some of the numbers are not output at all. This is because the random number generated for them is zero.
res8: Array[Int] = Array(1, 2, 2, 2, 2, 3, 3, 3, 3, 3,3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 7, 7, 7, 9, 9, 10)
def mapPartitionsWithContext[U: ClassTag](f:(TaskContext, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean =false): RDD[U]
val x = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10), 3)
def myfunc(index: Int, iter: Iterator[Int]) :Iterator[String] = {
iter.toList.map(x => index + "," + x).iterator
res10: Array[String] = Array(0,1, 0,2, 0,3, 1,4, 1,5,1,6, 2,7, 2,8, 2,9, 2,10)
val x = sc.parallelize(List(1,2,3,4,5,6,7,8,9,10), 3)
def myfunc(index: Int, iter: Iterator[Int]) :Iterator[String] = {
iter.toList.map(x => index + "," + x).iterator
res10: Array[String] = Array(0,1, 0,2, 0,3, 1,4, 1,5,1,6, 2,7, 2,8, 2,9, 2,10)
def mapValues[U](f: V => U): RDD[(K, U)]
val a = sc.parallelize(List("dog", "tiger","lion", "cat", "panther", "eagle"), 2)
val b = a.map(x => (x.length, x))
b.mapValues("x" + _ + "x").collect
res5: Array[(Int, String)] = Array((3,xdogx),(5,xtigerx), (4,xlionx), (3,xcatx), (7,xpantherx), (5,xeaglex))
mean [Double] meanApprox [Double]:
def mean(): Double
def meanApprox(timeout: Long, confidence: Double =0.95): PartialResult[BoundedDouble]
val a = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3,5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
res0: Double = 5.3
name, setName:
@transient var name: String
def setName(_name: String)
val y = sc.parallelize(1 to 10, 10)
res13: String = null
y.setName("Fancy RDD Name")
res15: String = Fancy RDD Name
final def partitions: Array[Partition]
res1: Array[org.apache.spark.Partition] =Array(org.apache.spark.rdd.ParallelCollectionPartition@691,org.apache.spark.rdd.ParallelCollectionPartition@692,org.apache.spark.rdd.ParallelCollectionPartition@693)
def cache(): RDD[T]
def persist(): RDD[T]
def persist(newLevel: StorageLevel): RDD[T]
val c = sc.parallelize(List("Gnu","Cat", "Rat", "Dog", "Gnu","Rat"), 2)
scala> c.getStorageLevel
res0: org.apache.spark.storage.StorageLevel =StorageLevel(false, false, false, 1)
res2: org.apache.spark.storage.StorageLevel =StorageLevel(false, true, true, 1)
pipe :
def pipe(command: String): RDD[String]
def pipe(command: String, env: Map[String, String]):RDD[String]
def pipe(command: Seq[String], env: Map[String,String] = Map(), printPipeContext: (String => Unit) => Unit = null,printRDDElement: (T, String => Unit) => Unit = null): RDD[String]
val a = sc.parallelize(1 to 9, 3)
a.pipe("head -n 1").collect
res2: Array[String] = Array(1, 4, 7)
reduce :
def reduce(f: (T, T) => T): T
val a = sc.parallelize(1 to 100, 3)
a.reduce(_ + _)
res41: Int = 5050
reduceByKey [Pair], reduceByKeyLocally [Pair], reduceByKeyToDriver [Pair]:
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def reduceByKey(func: (V, V) => V, numPartitions:Int): RDD[(K, V)]
def reduceByKey(partitioner: Partitioner, func: (V, V)=> V): RDD[(K, V)]
def reduceByKeyLocally(func: (V, V) => V): Map[K,V]
def reduceByKeyToDriver(func: (V, V) => V): Map[K,V]
val a = sc.parallelize(List("dog","cat", "owl", "gnu", "ant"), 2)
val b = a.map(x => (x.length, x))
b.reduceByKey(_ + _).collect
res86: Array[(Int, String)] =Array((3,dogcatowlgnuant))
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
b.reduceByKey(_ + _).collect
res87: Array[(Int, String)] = Array((4,lion), (3,dogcat),(7,panther), (5,tigereagle))
rightOuterJoin [Pair]:
def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K,(Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)],numPartitions: Int): RDD[(K, (Option[V], W))]
def rightOuterJoin[W](other: RDD[(K, W)], partitioner:Partitioner): RDD[(K, (Option[V], W))]
val a = sc.parallelize(List("dog","salmon", "salmon", "rat", "elephant"),3)
val b = a.keyBy(_.length)
val c =sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"),3)
val d = c.keyBy(_.length)
res2: Array[(Int, (Option[String], String))] =Array((6,(Some(salmon),salmon)), (6,(Some(salmon),rabbit)),(6,(Some(salmon),turkey)), (6,(Some(salmon),salmon)),(6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (3,(Some(dog),dog)),(3,(Some(dog),cat)),
(3,(Some(dog),gnu)), (3,(Some(dog),bee)),(3,(Some(rat),dog)), (3,(Some(rat),cat)), (3,(Some(rat),gnu)),(3,(Some(rat),bee)), (4,(None,wolf)), (4,(None,bear)))
def sample(withReplacement: Boolean, fraction: Double,seed: Int): RDD[T]
val a = sc.parallelize(1 to 10000, 3)
a.sample(false, 0.1, 0).count
res24: Long = 960
a.sample(true, 0.3, 0).count
res25: Long = 2888
a.sample(true, 0.3, 13).count
res26: Long = 2985
saveAsHadoopFile [Pair], saveAsHadoopDataset [Pair],saveAsNewAPIHadoopFile [Pair]:
def saveAsObjectFile(path: String)
val x = sc.parallelize(1 to 100, 3)
val y = sc.objectFile[Array[Int]]("objFile")
res52: Array[Int] = Array(67, 68, 69, 70, 71, 72, 73,74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,94, 95, 96, 97, 98, 99, 100, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33)
val v = sc.parallelize(Array(("owl",3),("gnu",4), ("dog",1), ("cat",2),("ant",5)), 2)
val a = sc.parallelize(1 to 10000, 3)
stats [Double]:
val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0,19.02, 19.29, 11.09, 21.0), 2)
res16: org.apache.spark.util.StatCounter = (count: 9,mean: 11.266667, stdev: 8.126859)
sortByKey [Ordered]:
def sortByKey(ascending: Boolean = true, numPartitions:Int = self.partitions.size): RDD[P]
val a = sc.parallelize(List("dog","cat", "owl", "gnu", "ant"), 2)
val b = sc.parallelize(1 to a.count.toInt, 2)
val c = a.zip(b)
res74: Array[(String, Int)] = Array((ant,5), (cat,2), (dog,1),(gnu,4), (owl,3))
res75: Array[(String, Int)] = Array((owl,3), (gnu,4),(dog,1), (cat,2), (ant,5))
val a = sc.parallelize(1 to 100, 5)
val b = a.cartesian(a)
val c = sc.parallelize(b.takeSample(true, 5, 13), 2)
val d = c.sortByKey(false)
res56: Array[(Int, Int)] = Array((96,9), (84,76),(59,59), (53,65), (52,4))
def subtract(other: RDD[T]): RDD[T]
def subtract(other: RDD[T], numPartitions: Int):RDD[T]
def subtract(other: RDD[T], p: Partitioner): RDD[T]
val a = sc.parallelize(1 to 9, 3)
val b = sc.parallelize(1 to 3, 3)
val c = a.subtract(b)
res3: Array[Int] = Array(6, 9, 4, 7, 5, 8)
def subtractByKey[W: ClassTag](other: RDD[(K, W)]):RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)],numPartitions: Int): RDD[(K, V)]
def subtractByKey[W: ClassTag](other: RDD[(K, W)], p:Partitioner): RDD[(K, V)]
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "spider","eagle"), 2)
val b = a.keyBy(_.length)
val c = sc.parallelize(List("ant","falcon", "squid"), 2)
val d = c.keyBy(_.length)
res15: Array[(Int, String)] = Array((4,lion))
sum [Double], sumApprox [Double]:
def sum(): Double
def sumApprox(timeout: Long, confidence: Double =0.95): PartialResult[BoundedDouble]
val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0,19.02, 19.29, 11.09, 21.0), 2)
res17: Double = 101.39999999999999
def take(num: Int): Array[T]
val b = sc.parallelize(List("dog","cat", "ape", "salmon", "gnu"), 2)
res18: Array[String] = Array(dog, cat)
val b = sc.parallelize(1 to 10000, 5000)
res6: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
def takeOrdered(num: Int)(implicit ord: Ordering[T]):Array[T]
val b = sc.parallelize(List("dog","cat", "ape", "salmon", "gnu"), 2)
res19: Array[String] = Array(ape, cat)
def takeSample(withReplacement: Boolean, num: Int,seed: Int): Array[T]
val x = sc.parallelize(1 to 1000, 3)
x.takeSample(true, 100, 1)
res3: Array[Int] = Array(339, 718, 810, 105, 71, 268,333, 360, 341, 300, 68, 848, 431, 449, 773, 172, 802, 339, 431, 285, 937, 301,167, 69, 330, 864, 40, 645, 65, 349, 613, 468, 982, 314, 160, 675, 232, 794,577, 571,
805, 317, 136, 860, 522, 45, 628, 178, 321, 482, 657, 114, 332, 728,901, 290, 175, 876, 227, 130, 863, 773, 559, 301, 694, 460, 839, 952, 664, 851,260, 729, 823, 880, 792, 964, 614, 821, 683, 364, 80, 875, 813, 951, 663, 344,546, 918, 436, 451, 397, 670, 756,
512, 391, 70, 213, 896, 123, 858)
def toDebugString: String
val a = sc.parallelize(1 to 9, 3)
val b = sc.parallelize(1 to 3, 3)
val c = a.subtract(b)
res6: String =
MappedRDD[15] at subtract at <console>:16 (3partitions)
SubtractedRDD[14]at subtract at <console>:16 (3 partitions)
MappedRDD[12] at subtract at <console>:16 (3 partitions)
ParallelCollectionRDD[10] at parallelize at <console>:12 (3partitions)
MappedRDD[13] at subtract at <console>:16 (3 partitions)
ParallelCollectionRDD[11] at parallelize at <console>:12 (3partitions)
def toJavaRDD() : JavaRDD[T]
def top(num: Int)(implicit ord: Ordering[T]): Array[T]
val c = sc.parallelize(Array(6, 9, 4, 7, 5, 8), 2)
res28: Array[Int] = Array(9, 8)
override def toString: String
val a = sc.parallelize(1 to 9, 3)
val b = sc.parallelize(1 to 3, 3)
val c = a.subtract(b)
res7: String = MappedRDD[15] at subtract at<console>:16
A 联合B ,类似数据库中的union
def ++(other: RDD[T]): RDD[T]
def union(other: RDD[T]): RDD[T]
val a = sc.parallelize(1 to 3, 1)
val b = sc.parallelize(5 to 7, 1)
(a ++ b).collect
res0: Array[Int] = Array(1, 2, 3, 5, 6, 7)
def unpersist(blocking: Boolean = true): RDD[T]
val y = sc.parallelize(1 to 10, 10)
val z = (y++y)
14/04/19 03:04:57 INFO UnionRDD: Removing RDD 22 frompersistence list
14/04/19 03:04:57 INFO BlockManager: Removing RDD 22
val a = sc.parallelize(List("dog","tiger", "lion", "cat", "panther","eagle"), 2)
val b = a.map(x => (x.length, x))
res3: Array[String] = Array(dog, tiger, lion, cat,panther, eagle)
variance [Double], sampleVariance [Double]:
sampleVariance :样本方差
val a = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3,5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)a.varianceres70: Double =10.605333333333332 val x = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02,19.29,
11.09, 21.0), 2)x.varianceres14: Double = 66.04584444444443x.sampleVarianceres13: Double = 74.30157499999999
def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)]
val a = sc.parallelize(1 to 100, 3)val b =sc.parallelize(101 to 200, 3)a.zip(b).collectres1: Array[(Int, Int)] =Array((1,101), (2,102), (3,103), (4,104), (5,105), (6,106), (7,107), (8,108),(9,109), (10,110), (11,111),
(12,112), (13,113), (14,114), (15,115), (16,116),(17,117), (18,118), (19,119), (20,120), (21,121), (22,122), (23,123), (24,124),(25,125), (26,126), (27,127), (28,128), (29,129), (30,130), (31,131), (32,132),(33,133), (34,134), (35,135), (36,136), (37,137),
(38,138), (39,139), (40,140),(41,141), (42,142), (43,143), (44,144), (45,145), (46,146), (47,147), (48,148),(49,149), (50,150), (51,151), (52,152), (53,153), (54,154), (55,155), (56,156),(57,157), (58,158), (59,159), (60,160), (61,161), (62,162), (63,163),
(64,164),(65,165), (66,166), (67,167), (68,168), (69,169), (70,170), (71,171), (72,172),(73,173), (74,174), (75,175), (76,176), (77,177), (78,... val a =sc.parallelize(1 to 100, 3)val b = sc.parallelize(101 to 200, 3)val c =sc.parallelize(201 to 300, 3)a.zip(b).zip(c).map((x)
=> (x._1._1, x._1._2,x._2 )).collectres12: Array[(Int, Int, Int)] = Array((1,101,201), (2,102,202),(3,103,203), (4,104,204), (5,105,205), (6,106,206), (7,107,207), (8,108,208),(9,109,209), (10,110,210), (11,111,211), (12,112,212), (13,113,213), (14,114,214),(15,115,215),
(16,116,216), (17,117,217), (18,118,218), (19,119,219),(20,120,220), (21,121,221), (22,122,222), (23,123,223), (24,124,224),(25,125,225), (26,126,226), (27,127,227), (28,128,228), (29,129,229),(30,130,230), (31,131,231), (32,132,232), (33,133,233), (34,134,234),(35,135,235),
(36,136,236), (37,137,237), (38,138,238), (39,139,239),(40,140,240), (41,141,241), (42,142,242), (43,143,243), (44,144,244),(45,145,245), (46,146,246), (47,147,247), (48,148,248), (49,149,249),(50,150,250), (51,151,251), (52,152,252), (53,153,253), (54,154,254),(55,155,255)...
def zipPartitions[B: ClassTag, V: ClassTag](rdd2:RDD[B])(f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V]defzipPartitions[B: ClassTag, V: ClassTag](rdd2: RDD[B], preservesPartitioning:Boolean)(f: (Iterator[T],
Iterator[B]) => Iterator[V]): RDD[V]defzipPartitions[B: ClassTag, C: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3:RDD[C])(f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V]defzipPartitions[B: ClassTag, C: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3:RDD[C],
preservesPartitioning: Boolean)(f: (Iterator[T], Iterator[B],Iterator[C]) => Iterator[V]): RDD[V]def zipPartitions[B: ClassTag, C:ClassTag, D: ClassTag, V: ClassTag](rdd2: RDD[B], rdd3: RDD[C], rdd4:RDD[D])(f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D])
=>Iterator[V]): RDD[V]def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V:ClassTag](rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D], preservesPartitioning: Boolean)(f:(Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V]
val a = sc.parallelize(0 to 9, 3)val b =sc.parallelize(10 to 19, 3)val c = sc.parallelize(100 to 109, 3)defmyfunc(aiter: Iterator[Int], biter: Iterator[Int], citer: Iterator[Int]):Iterator[String] ={ var res =List[String]()
while (aiter.hasNext&& biter.hasNext && citer.hasNext) { val x = aiter.next + " " + biter.next + " " +citer.next res ::= x } res.iterator}a.zipPartitions(b,c)(myfunc).collectres50: Array[String] = Array(2 12 102, 1 11 101, 0 10 100, 515 105, 4 14 104,
3 13 103, 9 19 109, 8 18 108, 7 17 107, 6 16 106)
作者:china_demon 发表于2016/8/1 4:29:35