twitter · jcoveney · Apr 15, 2014 · Apr 14, 2014 · Apr 15, 2014 · jcoveney
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/AdaptiveMatrix.scala
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala b/algebird-core/src/main/scala/com/twitter/algebird/SketchMap.scala
@@ -17,6 +17,7 @@ limitations under the License.
 package com.twitter.algebird
 
 import scala.collection.breakOut
+import com.twitter.algebird.matrix.AdaptiveMatrix
 
 /**
  * A Sketch Map is a generalized version of the Count-Min Sketch that is an
@@ -159,8 +160,8 @@ case class SketchMapParams[K](seed: Int, width: Int, depth: Int, heavyHittersCou
   def frequency[V:Ordering](key: K, table: AdaptiveMatrix[V]): V =
     hashes
       .view
-      .zip(table.rowsByColumns)
-      .map { case (hash, row) => row(hash(key)) }
+      // Pair every hash function with its position in `hashes`. The value bound to `row` below is
+      // that position, used as the row index; hash(key) supplies the column index for table.getValue.
+      .zipWithIndex
+      .map { case (hash, row) => table.getValue(row, hash(key)) }
       .min
 
   /**

diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/AdaptiveMatrix.scala
@@ -0,0 +1,120 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package com.twitter.algebird.matrix
+
+import scala.collection.mutable.{ArrayBuffer, Map => MMap}
+import com.twitter.algebird.{AdaptiveVector, Monoid}
+
+/**
+ * A matrix structure that is designed to hide moving between sparse and dense representations.
+ * Initial support here is focused on a dense row count with a sparse set of columns.
+ *
+ * Positions are (row, column) pairs.
+ */
+abstract class AdaptiveMatrix[V: Monoid] {
+  /** Number of rows. */
+  def rows: Int
+
+  /** Number of columns. */
+  def cols: Int
+
+  /** Total number of cells, rows * cols. */
+  def size: Int = rows * cols
+
+  /** Returns the value stored at the given (row, column) position. */
+  def getValue(position: (Int, Int)): V
+
+  /**
+   * Adds this matrix into `buffer` in place.
+   * The implementations in this package treat `buffer` as a flat row-major array in which the cell
+   * (row, col) lives at index row * cols + col, and combine each cell with the value already there.
+   */
+  def updateInto(buffer: ArrayBuffer[V]): Unit
+
+  /** Returns a matrix equal to this one except that `position` holds `value`. */
+  def updated(position: (Int, Int), value: V): AdaptiveMatrix[V]
+}
+
+object AdaptiveMatrix {
+  /** Builds a `rows` x `cols` matrix in which every cell holds the zero of the `Monoid[V]` in scope. */
+  def zero[V: Monoid](rows: Int, cols: Int): AdaptiveMatrix[V] =
+    fill(rows, cols)(implicitly[Monoid[V]].zero)
+
+  /**
+   * Builds a `rows` x `cols` matrix in which every cell holds `fill`.
+   * The result is a SparseColumnMatrix backed by one AdaptiveVector per row.
+   */
+  def fill[V: Monoid](rows: Int, cols: Int)(fill: V): AdaptiveMatrix[V] =
+    SparseColumnMatrix(Vector.fill(rows)(AdaptiveVector.fill[V](cols)(fill)))
+
+  /** Builds a matrix that has no rows. */
+  def empty[V: Monoid](): AdaptiveMatrix[V] =
+    SparseColumnMatrix(IndexedSeq[AdaptiveVector[V]]())
+
+  /**
+   * The adaptive monoid to swap between sparse modes.
+   *
+   * Matrices are summed cell by cell with the `Monoid[V]` in scope. The running sum is kept in
+   * per-row mutable maps until either a DenseMatrix is encountered or the first row of the running
+   * sum holds more than cols / 4 entries (only row 0 is inspected). From that point on the sum is
+   * accumulated in a flat row-major buffer and returned as a DenseMatrix.
+   *
+   * Operands without rows (such as `zero`) contribute nothing and are skipped. The dimensions are
+   * taken from the first operand that has rows; nothing here checks that later operands match.
+   */
+  implicit def monoid[V: Monoid]: Monoid[AdaptiveMatrix[V]] = new Monoid[AdaptiveMatrix[V]] {
+    private[this] final val innerZero = implicitly[Monoid[V]].zero
+
+    override def zero: AdaptiveMatrix[V] = SparseColumnMatrix[V](IndexedSeq[AdaptiveVector[V]]())
+
+    // sumOption only returns None for an empty input, so .get is safe on a two element list.
+    override def plus(a: AdaptiveMatrix[V], b: AdaptiveMatrix[V]): AdaptiveMatrix[V] =
+      sumOption(List(a, b)).get
+
+    // Adds every remaining matrix into buff and wraps the buffer as the dense result.
+    private def denseInsert(rows: Int, cols: Int, buff: ArrayBuffer[V], remainder: Iterator[AdaptiveMatrix[V]]): Option[AdaptiveMatrix[V]] = {
+      remainder.foreach(_.updateInto(buff))
+      Some(DenseMatrix(rows, cols, buff))
+    }
+
+    // Adds `other` into the per-row maps. The value already accumulated stays on the left of plus,
+    // the same operand order DenseMatrix.updateInto and SparseColumnMatrix.updateInto use.
+    private def sparseUpdate(storage: IndexedSeq[MMap[Int, V]], other: SparseColumnMatrix[V]): Unit = {
+      other.rowsByColumns.zipWithIndex.foreach { case (contents, indx) =>
+        val curMap: MMap[Int, V] = storage(indx)
+        AdaptiveVector.toMap(contents).foreach { case (col, value) =>
+          curMap.update(col, Monoid.plus(curMap.getOrElse(col, innerZero), value))
+        }
+      }
+    }
+
+    // Copies the sparse running sum into a fresh row-major buffer, then continues the sum densely.
+    private def goDense(rows: Int, cols: Int, storage: IndexedSeq[MMap[Int, V]], remainder: Iterator[AdaptiveMatrix[V]]): Option[AdaptiveMatrix[V]] = {
+      val buffer = ArrayBuffer.fill(rows * cols)(innerZero)
+      storage.zipWithIndex.foreach { case (rowMap, row) =>
+        val offset = row * cols
+        rowMap.foreach { case (col, value) => buffer(offset + col) = value }
+      }
+      denseInsert(rows, cols, buffer, remainder)
+    }
+
+    override def sumOption(items: TraversableOnce[AdaptiveMatrix[V]]): Option[AdaptiveMatrix[V]] =
+      if (items.isEmpty) {
+        None
+      } else {
+        // Matrices without rows hold no cells and cannot report a column count, so drop them
+        // before the dimensions are read from the head.
+        val iter = items.toIterator.filter(_.rows > 0).buffered
+        if (!iter.hasNext) {
+          // Every operand was empty: the sum is the identity.
+          Some(zero)
+        } else {
+          val rows = iter.head.rows
+          val cols = iter.head.cols
+          val sparseStorage: IndexedSeq[MMap[Int, V]] = Vector.fill(rows)(MMap[Int, V]())
+
+          @scala.annotation.tailrec
+          def accumulate(): Option[AdaptiveMatrix[V]] =
+            if (!iter.hasNext) {
+              // Need to still be sparse to reach here, so must unpack the MMap to be used again.
+              Some(SparseColumnMatrix.fromSeqMap(cols, sparseStorage))
+            } else {
+              iter.next() match {
+                case d @ DenseMatrix(_, _, _) =>
+                  // Carry the sparse running sum into the dense buffer before adding d,
+                  // otherwise everything summed so far would be lost.
+                  goDense(rows, cols, sparseStorage, Iterator.single[AdaptiveMatrix[V]](d) ++ iter)
+                case s @ SparseColumnMatrix(_) =>
+                  sparseUpdate(sparseStorage, s)
+                  if (sparseStorage(0).size > cols / 4) goDense(rows, cols, sparseStorage, iter)
+                  else accumulate()
+              }
+            }
+
+          accumulate()
+        }
+      }
+  }
+}
+
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/DenseMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/DenseMatrix.scala
@@ -0,0 +1,43 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package com.twitter.algebird.matrix
+import scala.collection.mutable.{ArrayBuffer, Map => MMap}
+
+import com.twitter.algebird.Monoid
+
+/**
+ * AdaptiveMatrix whose cells are held in one flat sequence, `rowsByColumns`, laid out row after
+ * row: the cell (row, col) is stored at index row * cols + col.
+ */
+case class DenseMatrix[V: Monoid](rows: Int, cols: Int, rowsByColumns: IndexedSeq[V]) extends AdaptiveMatrix[V] {
+  val valueMonoid = implicitly[Monoid[V]]
+
+  // Offset of a (row, column) position inside rowsByColumns.
+  private[this] def flatIndex(position: (Int, Int)): Int = {
+    val (row, col) = position
+    row * cols + col
+  }
+
+  /** Returns the value stored at the given (row, column) position. */
+  override def getValue(position: (Int, Int)): V = {
+    val offset = flatIndex(position)
+    rowsByColumns(offset)
+  }
+
+  /** Returns a new DenseMatrix in which `position` holds `value`; this instance is not reassigned. */
+  override def updated(position: (Int, Int), value: V): DenseMatrix[V] = {
+    val cells: IndexedSeq[V] = rowsByColumns.updated(flatIndex(position), value)
+    DenseMatrix[V](rows, cols, cells)
+  }
+
+  /**
+   * Combines every cell of this matrix into `buffer` at the same flat index, keeping the value
+   * already in the buffer on the left of the monoid's plus.
+   */
+  override def updateInto(buffer: ArrayBuffer[V]): Unit = {
+    val cellCount = size
+    var offset = 0
+    while (offset < cellCount) {
+      val merged = valueMonoid.plus(buffer(offset), rowsByColumns(offset))
+      buffer(offset) = merged
+      offset += 1
+    }
+  }
+
+}
+
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala b/algebird-core/src/main/scala/com/twitter/algebird/matrix/SparseColumnMatrix.scala
@@ -0,0 +1,65 @@
+/*
+Copyright 2012 Twitter, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package com.twitter.algebird.matrix
+import scala.collection.mutable.{ArrayBuffer, Map => MMap}
+import com.twitter.algebird.{Monoid, AdaptiveVector}
+
+object SparseColumnMatrix {
+  /**
+   * Builds a SparseColumnMatrix from one mutable map per row.
+   * Each map is snapshotted with toMap and handed to AdaptiveVector.fromMap together with the
+   * zero of the `Monoid[V]` in scope and `cols`.
+   */
+  def fromSeqMap[V: Monoid](cols: Int, data: IndexedSeq[MMap[Int, V]]): SparseColumnMatrix[V] = {
+    val sparseValue = implicitly[Monoid[V]].zero
+    val rowVectors =
+      for (rowMap <- data)
+        yield AdaptiveVector.fromMap(rowMap.toMap, sparseValue, cols)
+    SparseColumnMatrix(rowVectors.toIndexedSeq)
+  }
+}
+
+/**
+ * AdaptiveMatrix stored as one AdaptiveVector per row.
+ * Row is the outer Seq, the columns are the inner vectors.
+ */
+case class SparseColumnMatrix[V: Monoid](rowsByColumns: IndexedSeq[AdaptiveVector[V]]) extends AdaptiveMatrix[V] {
+
+  val valueMonoid = implicitly[Monoid[V]]
+
+  override def rows: Int = rowsByColumns.size
+
+  /**
+   * Column count, read from the first row.
+   * A matrix without rows (as built by AdaptiveMatrix.empty) reports 0 instead of failing on
+   * the missing first row.
+   */
+  override def cols: Int = if (rowsByColumns.isEmpty) 0 else rowsByColumns(0).size
+
+  /** Returns the value stored at the given (row, column) position. */
+  def getValue(position: (Int, Int)): V = rowsByColumns(position._1)(position._2)
+
+  /** Returns a new SparseColumnMatrix in which `position` holds `value`. */
+  def updated(position: (Int, Int), value: V): SparseColumnMatrix[V] = {
+    val (row, col) = position
+    SparseColumnMatrix[V](rowsByColumns.updated(row, rowsByColumns(row).updated(col, value)))
+  }
+
+  /**
+   * Combines this matrix into `buffer`, which is indexed row-major (row * cols + col).
+   * Only the (column, value) pairs produced by each row's denseIterator are visited; the value
+   * already in the buffer stays on the left of the monoid's plus.
+   */
+  override def updateInto(buffer: ArrayBuffer[V]): Unit = {
+    val lcols = cols
+    val lrows = rows
+    var row = 0
+    while (row < lrows) {
+      val iter = rowsByColumns(row).denseIterator
+      while (iter.hasNext) {
+        val (col, value) = iter.next
+        val indx = row * lcols + col
+        buffer(indx) = valueMonoid.plus(buffer(indx), value)
+      }
+      row += 1
+    }
+  }
+
+  /** Materialises this matrix as a DenseMatrix of the same dimensions. */
+  def toDense: DenseMatrix[V] = {
+    val buf = ArrayBuffer.fill(size)(valueMonoid.zero)
+    updateInto(buf)
+    DenseMatrix(rows, cols, buf)
+  }
+}