@@ -2,6 +2,7 @@ package com.springml.spark.salesforce
2
2
3
3
import com .sforce .soap .partner .{SaveResult , Connector , PartnerConnection }
4
4
import com .sforce .ws .ConnectorConfig
5
+ import com .madhukaraphatak .sizeof .SizeEstimator
5
6
import org .apache .log4j .Logger
6
7
import org .apache .spark .rdd .RDD
7
8
import org .apache .spark .sql .Row
@@ -10,7 +11,7 @@ import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}
10
11
/**
11
12
* Created by madhu on 9/7/15.
12
13
*/
13
- object Utils extends Serializable {
14
+ object Utils extends Serializable {
14
15
15
16
16
17
private def fieldJson (fieldName: String ,datasetName: String ) = {
@@ -70,18 +71,41 @@ object Utils extends Serializable{
70
71
})
71
72
}
72
73
73
/**
 * Re-partitions `rdd` based on its estimated total data size so that each
 * partition carries roughly at most `maxBundleSize` bytes.
 *
 * The size estimate comes from `getTotalSize`. When the estimate does not
 * exceed one bundle, a single partition is used.
 *
 * @param rdd rows to be re-partitioned
 * @return the coalesced RDD
 */
def repartition(rdd: RDD[Row]): RDD[Row] = {
  val totalDataSize = getTotalSize(rdd)
  // Maximum estimated number of bytes per partition (10 MB).
  val maxBundleSize = 1024L * 1024L * 10L

  // floor(total / max) + 1 partitions once the data exceeds a single bundle.
  // Plain Long arithmetic is used: the quotient is already integral, so rounding
  // it would only push it through a lossy floating-point conversion. The clamp
  // keeps the final "+ 1" from overflowing Int.
  val partitions: Int =
    if (totalDataSize > maxBundleSize) {
      math.min(totalDataSize / maxBundleSize, Int.MaxValue - 1L).toInt + 1
    } else {
      1
    }

  // Shuffle only when the partition count has to grow; coalesce cannot
  // increase the number of partitions without a shuffle.
  val shuffle = rdd.partitions.length < partitions
  rdd.coalesce(partitions, shuffle)
}
82
85
86
/**
 * Estimates the total size of the rows in `rdd`, using `getRDDSize` as the
 * measuring function.
 *
 * When the RDD holds more than `NO_OF_SAMPLE_ROWS` rows, exactly that many rows
 * are sampled, measured, and the result is scaled up linearly to the full row
 * count. Otherwise the whole RDD is measured directly.
 *
 * @param rdd rows whose total size is to be estimated
 * @return the estimated total size as reported by `getRDDSize`, extrapolated
 *         when sampling was used
 */
def getTotalSize(rdd: RDD[Row]): Long = {
  // This can be fetched as optional parameter
  val NO_OF_SAMPLE_ROWS = 10L
  val totalRows = rdd.count()

  if (totalRows > NO_OF_SAMPLE_ROWS) {
    // RDD.sample expects a *fraction*, not a row count; passing the count there
    // (with replacement) yields a sample several times larger than the RDD itself.
    // takeSample returns exactly the requested number of rows, which is what the
    // extrapolation below assumes.
    val sampleRows = rdd.takeSample(withReplacement = false, num = NO_OF_SAMPLE_ROWS.toInt)
    val sampleRDD = rdd.sparkContext.parallelize(sampleRows)
    val sampleRDDSize = getRDDSize(sampleRDD)
    // Linear extrapolation: (size of sample) * (total rows / sampled rows).
    sampleRDDSize * totalRows / NO_OF_SAMPLE_ROWS
  } else {
    getRDDSize(rdd)
  }
}
83
101
84
-
85
-
86
-
102
/**
 * Sums the estimated size of every row in `rdd`.
 *
 * Each row is rendered as a delimited string of its column values and that
 * string is measured with `SizeEstimator.estimate`.
 *
 * Note: the whole RDD is collected to the driver, so this is only suitable for
 * small RDDs (e.g. a sample).
 *
 * @param rdd rows to measure
 * @return the sum of the per-row size estimates; 0 for an empty RDD
 */
def getRDDSize(rdd: RDD[Row]): Long = {
  rdd.collect().foldLeft(0L) { (rddSize, row) =>
    // Null column values are rendered as empty strings instead of calling
    // toString on them, which would throw a NullPointerException.
    // NOTE(review): confirm that an empty field matches how nulls are written out.
    val rowText = row.toSeq
      .map(value => Option(value).map(_.toString).getOrElse(""))
      .mkString(" ,")
    rddSize + SizeEstimator.estimate(rowText)
  }
}
87
111
}
0 commit comments