Skip to content

Commit 014dda9

Browse files
committedJan 12, 2015
Make chunk size an argument. Default is -chunk_bits 24 -> 16MB.
1 parent 03b6a73 commit 014dda9

File tree

7 files changed

+51
-38
lines changed

7 files changed

+51
-38
lines changed
 

Diff for: ‎src/main/java/water/H2O.java

+13
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ public final class H2O {
4646
// Max. number of factor levels ber column (before flipping all to NAs)
4747
public static int DATA_MAX_FACTOR_LEVELS = 65000;
4848

49+
public static int LOG_CHK = 24; // Chunks are 1<<24, or 16Meg
50+
4951
// The multicast discovery port
5052
static MulticastSocket CLOUD_MULTICAST_SOCKET;
5153
static NetworkInterface CLOUD_MULTICAST_IF;
@@ -713,6 +715,7 @@ public static class OptArgs extends Arguments.Opt {
713715
public String version = null;
714716
public String single_precision = null;
715717
public int data_max_factor_levels;
718+
public int chunk_bits;
716719
public String beta = null;
717720
public String mem_watchdog = null; // For developer debugging
718721
public boolean md5skip = false;
@@ -765,6 +768,10 @@ public static void printHelp() {
765768
" from double to single precision to save memory of numerical data.\n" +
766769
" (The default is double precision.)\n" +
767770
"\n" +
771+
" -chunk_bits <integer>\n" +
772+
" The number of bits per chunk.\n" +
773+
" (The default is " + LOG_CHK + ", which is " + PrettyPrint.bytes(1<<LOG_CHK) + ".)\n" +
774+
"\n" +
768775
" -data_max_factor_levels <integer>\n" +
769776
" The maximum number of factor levels for categorical columns.\n" +
770777
" Columns with more than the specified number of factor levels\n" +
@@ -918,6 +925,12 @@ public static void main( String[] args ) {
918925
Log.info("Max. number of factor levels per column: " + DATA_MAX_FACTOR_LEVELS);
919926
}
920927

928+
if (OPT_ARGS.chunk_bits != 0) {
929+
if (OPT_ARGS.chunk_bits > 0)
930+
LOG_CHK = OPT_ARGS.chunk_bits;
931+
}
932+
Log.info("Chunk size: " + PrettyPrint.bytes(1<<LOG_CHK));
933+
921934
// Get ice path before loading Log or Persist class
922935
String ice = DEFAULT_ICE_ROOT();
923936
if( OPT_ARGS.ice_root != null ) ice = OPT_ARGS.ice_root.replace("\\", "/");

Diff for: ‎src/main/java/water/fvec/FileVec.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
public abstract class FileVec extends ByteVec {
66
long _len; // File length
77
final byte _be;
8-
public static final int CHUNK_SZ = 1 << LOG_CHK;
8+
public static final int CHUNK_SZ = 1 << H2O.LOG_CHK;
99

1010

1111
protected FileVec(Key key, long len, byte be) {
@@ -15,7 +15,7 @@ protected FileVec(Key key, long len, byte be) {
1515
}
1616

1717
@Override public long length() { return _len; }
18-
@Override public int nChunks() { return (int)Math.max(1,_len>>LOG_CHK); }
18+
@Override public int nChunks() { return (int)Math.max(1,_len>>H2O.LOG_CHK); }
1919
@Override public boolean writable() { return false; }
2020

2121
//NOTE: override ALL rollups-related methods or ALL files will be loaded after import.
@@ -46,7 +46,7 @@ protected FileVec(Key key, long len, byte be) {
4646
@Override
4747
public int elem2ChunkIdx(long i) {
4848
assert 0 <= i && i <= _len : " "+i+" < "+_len;
49-
int cidx = (int)(i>>LOG_CHK);
49+
int cidx = (int)(i>>H2O.LOG_CHK);
5050
int nc = nChunks();
5151
if( i >= _len ) return nc;
5252
if( cidx >= nc ) cidx=nc-1; // Last chunk is larger
@@ -56,9 +56,9 @@ public int elem2ChunkIdx(long i) {
5656
// Convert a chunk-index into a starting row #. Constant sized chunks
5757
// (except for the last, which might be a little larger), and size-1 rows so
5858
// this is a little shift-n-add math.
59-
@Override public long chunk2StartElem( int cidx ) { return (long)cidx <<LOG_CHK; }
59+
@Override public long chunk2StartElem( int cidx ) { return (long)cidx <<H2O.LOG_CHK; }
6060
// Convert a chunk-key to a file offset. Size 1 rows, so this is a direct conversion.
61-
static public long chunkOffset ( Key ckey ) { return (long)chunkIdx(ckey)<<LOG_CHK; }
61+
static public long chunkOffset ( Key ckey ) { return (long)chunkIdx(ckey)<<H2O.LOG_CHK; }
6262
// Reverse: convert a chunk-key into a cidx
6363
static public int chunkIdx(Key ckey) { assert ckey._kb[0]==Key.DVEC; return UDP.get4(ckey._kb,1+1+4); }
6464

Diff for: ‎src/main/java/water/fvec/NewChunk.java

+6-3
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,8 @@ void append2( long l, int x ) {
336336

337337
// Slow-path append data
338338
private void append2slowd() {
339-
if( _sparseLen > Vec.CHUNK_SZ )
339+
final int CHUNK_SZ = 1 << H2O.LOG_CHK;
340+
if( _sparseLen > CHUNK_SZ )
340341
throw new ArrayIndexOutOfBoundsException(_sparseLen);
341342
assert _ls==null;
342343
if(_ds != null && _ds.length > 0){
@@ -355,7 +356,8 @@ private void append2slowd() {
355356
}
356357
// Slow-path append data
357358
private void append2slowUUID() {
358-
if( _sparseLen > Vec.CHUNK_SZ )
359+
final int CHUNK_SZ = 1 << H2O.LOG_CHK;
360+
if( _sparseLen > CHUNK_SZ )
359361
throw new ArrayIndexOutOfBoundsException(_sparseLen);
360362
if( _ds==null && _ls!=null ) { // This can happen for columns with all NAs and then a UUID
361363
_xs=null;
@@ -374,7 +376,8 @@ private void append2slowUUID() {
374376
}
375377
// Slow-path append data
376378
private void append2slow( ) {
377-
if( _sparseLen > Vec.CHUNK_SZ )
379+
final int CHUNK_SZ = 1 << H2O.LOG_CHK;
380+
if( _sparseLen > CHUNK_SZ )
378381
throw new ArrayIndexOutOfBoundsException(_sparseLen);
379382
assert _ds==null;
380383
if(_ls != null && _ls.length > 0){

Diff for: ‎src/main/java/water/fvec/UploadFileVec.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public void close(C1NChunk c, int cidx, Futures fs) {
2525
assert _len==-1; // Not closed
2626
c._vec = this; // Attach chunk to this vec.
2727
DKV.put(chunkKey(cidx),c,fs); // Write updated chunk back into K/V
28-
_len = ((_nchunks-1L)<<LOG_CHK)+c._len;
28+
_len = ((_nchunks-1L)<<H2O.LOG_CHK)+c._len;
2929
}
3030

3131
@Override public Value chunkIdx( int cidx ) {

Diff for: ‎src/main/java/water/fvec/Vec.java

+3-7
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,6 @@
4545
* @author Cliff Click
4646
*/
4747
public class Vec extends Iced {
48-
/** Log-2 of Chunk size. */
49-
public static final int LOG_CHK = 24; // Chunks are 1<<24, or 16Meg
50-
/** Chunk size. Bigger increases batch sizes, lowers overhead costs, lower
51-
* increases fine-grained parallelism. */
52-
public static final int CHUNK_SZ = 1 << LOG_CHK;
5348

5449
/** Key mapping a Value which holds this Vec. */
5550
final public Key _key; // Top-level key
@@ -224,10 +219,11 @@ public void map(Chunk[] cs) {
224219
}.doAll(makeConSeq(0, len)).vecs(0);
225220
}
226221
public static Vec makeConSeq(double x, long len) {
227-
int chunks = (int)Math.ceil((double)len / Vec.CHUNK_SZ);
222+
final int CHUNK_SZ = 1 << H2O.LOG_CHK;
223+
int chunks = (int)Math.ceil((double)len / CHUNK_SZ);
228224
long[] espc = new long[chunks+1];
229225
for (int i = 1; i<=chunks; ++i)
230-
espc[i] = Math.min(espc[i-1] + Vec.CHUNK_SZ, len);
226+
espc[i] = Math.min(espc[i-1] + CHUNK_SZ, len);
231227
return new Vec(VectorGroup.VG_LEN1.addVec(), espc).makeCon(x);
232228
}
233229

Diff for: ‎src/main/java/water/util/Utils.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -817,7 +817,7 @@ public static Compression guessCompressionMethod(byte [] bits){
817817
break;
818818
off += len;
819819
if( off == bs.length ) { // Dataset is uncompressing alot! Need more space...
820-
if( bs.length >= water.fvec.Vec.CHUNK_SZ )
820+
if( bs.length >= (1 << H2O.LOG_CHK))
821821
break; // Already got enough
822822
bs = Arrays.copyOf(bs, bs.length * 2);
823823
}

Diff for: ‎src/test/java/water/fvec/VecTest.java

+22-21
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
/** This test tests stability of Vec API. */
1818
public class VecTest extends TestUtil {
19+
final int CHUNK_SZ = 1 << H2O.LOG_CHK;
1920

2021
/** Test toEnum call to return correct domain. */
2122
@Test public void testToEnum() {
@@ -75,62 +76,62 @@ private static final void testChangeDomainImpl(){
7576
@Test public void testMakeConSeq() {
7677
Vec v;
7778

78-
v = makeConSeq(0xCAFE,Vec.CHUNK_SZ);
79+
v = makeConSeq(0xCAFE,CHUNK_SZ);
7980
assertTrue(v.at(234) == 0xCAFE);
8081
assertTrue(v._espc.length == 2);
8182
assertTrue(
8283
v._espc[0] == 0 &&
83-
v._espc[1] == Vec.CHUNK_SZ
84+
v._espc[1] == CHUNK_SZ
8485
);
8586
v.remove(new Futures()).blockForPending();
8687

87-
v = makeConSeq(0xCAFE,2*Vec.CHUNK_SZ);
88+
v = makeConSeq(0xCAFE,2*CHUNK_SZ);
8889
assertTrue(v.at(234) == 0xCAFE);
89-
assertTrue(v.at(2*Vec.CHUNK_SZ-1) == 0xCAFE);
90+
assertTrue(v.at(2*CHUNK_SZ-1) == 0xCAFE);
9091
assertTrue(v._espc.length == 3);
9192
assertTrue(
9293
v._espc[0] == 0 &&
93-
v._espc[1] == Vec.CHUNK_SZ &&
94-
v._espc[2] == Vec.CHUNK_SZ*2
94+
v._espc[1] == CHUNK_SZ &&
95+
v._espc[2] == CHUNK_SZ*2
9596
);
9697
v.remove(new Futures()).blockForPending();
9798

98-
v = makeConSeq(0xCAFE,2*Vec.CHUNK_SZ+1);
99+
v = makeConSeq(0xCAFE,2*CHUNK_SZ+1);
99100
assertTrue(v.at(234) == 0xCAFE);
100-
assertTrue(v.at(2*Vec.CHUNK_SZ) == 0xCAFE);
101+
assertTrue(v.at(2*CHUNK_SZ) == 0xCAFE);
101102
assertTrue(v._espc.length == 4);
102103
assertTrue(
103104
v._espc[0] == 0 &&
104-
v._espc[1] == Vec.CHUNK_SZ &&
105-
v._espc[2] == Vec.CHUNK_SZ*2 &&
106-
v._espc[3] == Vec.CHUNK_SZ*2+1
105+
v._espc[1] == CHUNK_SZ &&
106+
v._espc[2] == CHUNK_SZ*2 &&
107+
v._espc[3] == CHUNK_SZ*2+1
107108
);
108109
v.remove(new Futures()).blockForPending();
109110

110-
v = makeConSeq(0xCAFE,3*Vec.CHUNK_SZ);
111+
v = makeConSeq(0xCAFE,3*CHUNK_SZ);
111112
assertTrue(v.at(234) == 0xCAFE);
112-
assertTrue(v.at(3*Vec.CHUNK_SZ-1) == 0xCAFE);
113+
assertTrue(v.at(3*CHUNK_SZ-1) == 0xCAFE);
113114
assertTrue(v._espc.length == 4);
114115
assertTrue(
115116
v._espc[0] == 0 &&
116-
v._espc[1] == Vec.CHUNK_SZ &&
117-
v._espc[2] == Vec.CHUNK_SZ*2 &&
118-
v._espc[3] == Vec.CHUNK_SZ*3
117+
v._espc[1] == CHUNK_SZ &&
118+
v._espc[2] == CHUNK_SZ*2 &&
119+
v._espc[3] == CHUNK_SZ*3
119120
);
120121
v.remove(new Futures()).blockForPending();
121122
}
122123
// Test HEX-1819
123124
@Test public void testMakeSeq() {
124-
Vec v = makeSeq(3*Vec.CHUNK_SZ);
125+
Vec v = makeSeq(3*CHUNK_SZ);
125126
assertTrue(v.at(0) == 1);
126127
assertTrue(v.at(234) == 235);
127-
assertTrue(v.at(2*Vec.CHUNK_SZ) == 2*Vec.CHUNK_SZ+1);
128+
assertTrue(v.at(2*CHUNK_SZ) == 2*CHUNK_SZ+1);
128129
assertTrue(v._espc.length == 4);
129130
assertTrue(
130131
v._espc[0] == 0 &&
131-
v._espc[1] == Vec.CHUNK_SZ &&
132-
v._espc[2] == Vec.CHUNK_SZ * 2 &&
133-
v._espc[3] == Vec.CHUNK_SZ * 3
132+
v._espc[1] == CHUNK_SZ &&
133+
v._espc[2] == CHUNK_SZ * 2 &&
134+
v._espc[3] == CHUNK_SZ * 3
134135
);
135136
v.remove(new Futures()).blockForPending();
136137
}

0 commit comments

Comments
 (0)
Please sign in to comment.