
Commit 1948fe2

Repartition is now done based on size
1 parent 45adc19 commit 1948fe2

File tree (5 files changed: +39 -11 lines changed)

.gitignore
build.sbt
project/plugins.sbt
src/main/scala/com/springml/spark/salesforce/Utils.scala
src/test/scala/com/springml/spark/salesforce/TestUtils.scala


.gitignore (+3)

@@ -4,6 +4,9 @@
 .classpath
 .project
 .settings/
+.cache-main
+.cache-tests
 target/
 project/target
 dependency-reduced-pom.xml
+/bin/

build.sbt (+1 -1)

@@ -21,7 +21,7 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori
 resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"

 libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test"
-
+libraryDependencies += "com.madhukaraphatak" %% "java-sizeof" % "0.1"


 //unmanagedJars in Compile += file("lib/partner.jar")
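The java-sizeof dependency added above provides the SizeEstimator used in Utils.scala below to measure row sizes. A minimal standalone sketch of the call, with illustrative values (the object name and sample row are ours, not part of the commit):

import com.madhukaraphatak.sizeof.SizeEstimator

object SizeEstimateSketch {
  def main(args: Array[String]): Unit = {
    // Build a CSV-style string from a row's values, mirroring what
    // getRDDSize in Utils.scala does for each Spark Row.
    val row = Seq(1, "madhu", 1000.5).map(_.toString).mkString(",")
    // estimate returns the approximate in-memory size of the object in bytes.
    println(SizeEstimator.estimate(row))
  }
}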

project/plugins.sbt (+1)

@@ -1,2 +1,3 @@
 addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.2")
 addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
+addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")

src/main/scala/com/springml/spark/salesforce/Utils.scala (+33 -9)

@@ -2,6 +2,7 @@ package com.springml.spark.salesforce

 import com.sforce.soap.partner.{SaveResult, Connector, PartnerConnection}
 import com.sforce.ws.ConnectorConfig
+import com.madhukaraphatak.sizeof.SizeEstimator
 import org.apache.log4j.Logger
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.Row
@@ -10,7 +11,7 @@ import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}
 /**
  * Created by madhu on 9/7/15.
  */
-object Utils extends Serializable{
+object Utils extends Serializable {


   private def fieldJson(fieldName:String,datasetName:String) = {
@@ -70,18 +71,41 @@ object Utils extends Serializable{
     })
   }

-  def repartition(rdd: RDD[Row]): RDD[Row] = {
+  def repartition(rdd: RDD[Row]): RDD[Row] = {
+    val totalDataSize = getTotalSize(rdd)
+    val maxBundleSize = 1024 * 1024 * 10l;
+    var partitions = 1
+    if (totalDataSize > maxBundleSize) {
+      partitions = Math.round(totalDataSize / maxBundleSize) + 1
+    }

-    val NO_OF_ROWS_PARTITION = 500
-    val totalRows = rdd.count()
-    val partitions = Math.round(totalRows / NO_OF_ROWS_PARTITION) + 1
-    //val noPartitions = Math.max(rdd.partitions.length, partititons)
     val shuffle = rdd.partitions.length < partitions
     rdd.coalesce(partitions.toInt, shuffle)
   }

+  def getTotalSize(rdd: RDD[Row]): Long = {
+    // This can be fetched as optional parameter
+    val NO_OF_SAMPLE_ROWS = 10l;
+    val totalRows = rdd.count();
+    var totalSize = 0l
+    if (totalRows > NO_OF_SAMPLE_ROWS) {
+      val sampleRDD = rdd.sample(true, NO_OF_SAMPLE_ROWS)
+      val sampleRDDSize = getRDDSize(sampleRDD)
+      totalSize = sampleRDDSize.*(totalRows)./(NO_OF_SAMPLE_ROWS)
+    } else {
+      totalSize = getRDDSize(rdd)
+    }
+
+    totalSize
+  }

-
-
-
+  def getRDDSize(rdd: RDD[Row]) : Long = {
+    var rddSize = 0l
+    val rows = rdd.collect()
+    for (i <- 0 until rows.length) {
+      rddSize += SizeEstimator.estimate(rows.apply(i).toSeq.map { value => value.toString() }.mkString(","))
+    }
+
+    rddSize
+  }
 }
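Taken together, the change swaps the old fixed rule (roughly 500 rows per partition) for a size target: getTotalSize samples up to NO_OF_SAMPLE_ROWS rows, measures them with SizeEstimator, and extrapolates totalSize = sampleRDDSize * totalRows / NO_OF_SAMPLE_ROWS; repartition then aims for bundles of roughly 10 MB. A sketch of that arithmetic with assumed numbers (all values and the object name are illustrative, not from the commit):

object RepartitionMathSketch {
  def main(args: Array[String]): Unit = {
    // Step 1: extrapolate total size from a sample, as getTotalSize does.
    val NO_OF_SAMPLE_ROWS = 10L
    val sampleRDDSize = 10L * 1024   // assume the 10 sampled rows measure ~10 KB
    val totalRows = 100000L          // assume rdd.count() returned 100,000
    val totalDataSize = sampleRDDSize * totalRows / NO_OF_SAMPLE_ROWS // 102,400,000 bytes (~98 MB)

    // Step 2: derive the partition count, as repartition does.
    val maxBundleSize = 1024 * 1024 * 10L // 10 MB per bundle
    var partitions = 1
    if (totalDataSize > maxBundleSize) {
      // The commit computes Math.round(totalDataSize / maxBundleSize) + 1;
      // the Long division already truncates: 102,400,000 / 10,485,760 = 9, plus 1 = 10.
      partitions = (totalDataSize / maxBundleSize).toInt + 1
    }
    println(partitions) // 10
  }
}

With ~1 KB rows this yields 10 partitions of roughly 10 MB each, whereas the old rule would have produced totalRows / 500 + 1 = 201 partitions regardless of row size.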

src/test/scala/com/springml/spark/salesforce/TestUtils.scala (+1 -1)

@@ -43,7 +43,7 @@ class TestUtils extends FunSuite {

     val repartitionDF = Utils.repartition(inMemoryRDD)

-    assert(repartitionDF.partitions.length == 5)
+    assert(repartitionDF.partitions.length >= 30)


  }
