一、cache和persisit的对比
-rw-r--r--@ 1 hadoop staff 68M 5 17 07:04 access.log
cache/persitence是 lazy的,延迟加载 unpersitence是立即执行的
@DeveloperApi
class StorageLevel private(
private var _usedisk: Boolean,
private var _useMemory: Boolean,
private var _uSEOffheap: Boolean,
private var _deserialized: Boolean,
private var _replication: Int = 1)
extends Externalizable { }
/**
* VarIoUs [[org.apache.spark.storage.StorageLevel]] defined and utility functions for creating
* new storage levels.
*/
object StorageLevel {
val NONE = new StorageLevel(false, false, false, false)
val disK_ONLY = new StorageLevel(true, false, false, false)
val disK_ONLY_2 = new StorageLevel(true, false, false, false, 2)
val MEMORY_ONLY = new StorageLevel(false, true, false, true)
val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)
val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)
val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)
val MEMORY_AND_disK = new StorageLevel(true, true, false, true)
val MEMORY_AND_disK_2 = new StorageLevel(true, true, false, true, 2)
val MEMORY_AND_disK_SER = new StorageLevel(true, true, false, false)
val MEMORY_AND_disK_SER_2 = new StorageLevel(true, true, false, false, 2)
val OFF_HEAP = new StorageLevel(true, true, true, false, 1)
/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def persist(): this.type = persist(StorageLevel.MEMORY_ONLY)
/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def cache(): this.type = persist()
/**
* Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.
*
* @param blocking Whether to block until all blocks are deleted.
* @return This RDD.
*/
def unpersist(blocking: Boolean = true): this.type = {
logInfo("Removing RDD " + id + " from persistence list")
sc.unpersistRDD(id, blocking)
storageLevel = StorageLevel.NONE
this
}
/** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */
def getStorageLevel: StorageLevel = storageLevel
二、序列化测试Java和kyro
序列化: 默认java序列化类User 使用kyro序列化没有未注册类User 使用kryo序列化并注册类User
默认java序列化类User
import scala.collection.mutable.ListBuffer
class User(id:Int,username:String,age:String) extends Serializable
val users = new ListBuffer[User]
for(i <- 1 to 1000000){
users.+=(new User(i,"name"+i,i.toString))
}
val usersRDD=sc.parallelize(users)
import org.apache.spark.storage.StorageLevel
usersRDD.persist(StorageLevel.MEMORY_ONLY_SER)
usersRDD.foreach(println(_))
使用kyro序列化没有未注册类User
import org.apache.spark.SparkConf
val sparkConf= new SparkConf()
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
import org.apache.spark.SparkContext
使用kryo序列化并注册类User
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[User]))
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 [email protected] 举报,一经查实,本站将立刻删除。