
Commit d2845d8 (committed Sep 13, 2017)
Parent: d98fcf2

Add NSQ as a stream source (closes #64)

File tree

6 files changed: +357 -96 lines


build.sbt (+2)
@@ -30,6 +30,8 @@ lazy val root = project.in(file("."))
     Dependencies.Libraries.elephantbird,
     Dependencies.Libraries.hadoopLZO,
     Dependencies.Libraries.jodaTime,
+    Dependencies.Libraries.nsqClient,
+    Dependencies.Libraries.jacksonCbor,
     // Scala
     Dependencies.Libraries.scopt,
     Dependencies.Libraries.config,

project/Dependencies.scala (+9 -2)
@@ -14,6 +14,7 @@ import sbt._
 
 object Dependencies {
   val resolvers = Seq(
+    Resolver.jcenterRepo,
     "Snowplow Analytics Maven releases repo" at "http://maven.snplow.com/releases/",
     "Twitter maven repo" at "http://maven.twttr.com/"
   )
@@ -28,6 +29,8 @@
     val hadoopLZO = "0.4.20"
     val jodaTime = "2.9.9"
     val config = "1.3.1"
+    val nsqClient = "1.1.0-rc1"
+    val jacksonCbor = "2.8.8"
     // Thrift (test only)
     val collectorPayload = "0.0.0"
     // Scala
@@ -42,8 +45,11 @@
   object Libraries {
     // Java
     val slf4j = "org.slf4j" % "slf4j-simple" % V.slf4j
-    val kinesisClient = "com.amazonaws" % "amazon-kinesis-client" % V.kinesisClient
-    val kinesisConnector = "com.amazonaws" % "amazon-kinesis-connectors" % V.kinesisConnector
+    val kinesisClient = ("com.amazonaws" % "amazon-kinesis-client" % V.kinesisClient)
+      .exclude("com.fasterxml.jackson.dataformat", "jackson-dataformat-cbor")
+    val kinesisConnector = ("com.amazonaws" % "amazon-kinesis-connectors" % V.kinesisConnector)
+      .exclude("com.fasterxml.jackson.dataformat", "jackson-dataformat-cbor")
+    val jacksonCbor = "com.fasterxml.jackson.dataformat" % "jackson-dataformat-cbor" % V.jacksonCbor
     val hadoop = ("org.apache.hadoop" % "hadoop-common" % V.hadoop)
       .exclude("org.slf4j", "slf4j-log4j12")
       .exclude("commons-beanutils", "commons-beanutils")
@@ -56,6 +62,7 @@
     val hadoopLZO = "com.hadoop.gplcompression" % "hadoop-lzo" % V.hadoopLZO
     val jodaTime = "joda-time" % "joda-time" % V.jodaTime
     val config = "com.typesafe" % "config" % V.config
+    val nsqClient = "com.snowplowanalytics" % "nsq-java-client_2.10" % V.nsqClient
     // Thrift (test only)
     val collectorPayload = "com.snowplowanalytics" % "collector-payload-1" % V.collectorPayload % "test"
     // Scala
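Note the exclude-and-pin pattern above: both Kinesis artifacts pull in their own transitive jackson-dataformat-cbor, so the build excludes it from each and declares one explicit version, presumably so the NSQ client and the Kinesis SDK agree on a single CBOR artifact. A minimal standalone build.sbt sketch of the same idiom (the Kinesis client version below is a placeholder, not the project's V.kinesisClient):

// Sketch only: exclude the transitive CBOR module from the AWS artifact,
// then pin one jackson-dataformat-cbor version for the whole build.
libraryDependencies ++= Seq(
  ("com.amazonaws" % "amazon-kinesis-client" % "1.x.y") // placeholder version
    .exclude("com.fasterxml.jackson.dataformat", "jackson-dataformat-cbor"),
  "com.fasterxml.jackson.dataformat" % "jackson-dataformat-cbor" % "2.8.8"
)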
Configuration sample (+82 -62)

@@ -1,86 +1,106 @@
 # Default configuration for kinesis-lzo-s3-sink
 
-sink {
-
-  # The following are used to authenticate for the Amazon Kinesis sink.
-  #
-  # If both are set to 'default', the default provider chain is used
-  # (see http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html)
-  #
-  # If both are set to 'iam', use AWS IAM Roles to provision credentials.
-  #
-  # If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
-  aws {
-    access-key: "iam"
-    secret-key: "iam"
-  }
+# Sources currently supported are:
+# 'kinesis' for reading records from a Kinesis stream
+# 'nsq' for reading records from an NSQ topic
+source: "{{source}}"
 
-  kinesis {
-    in {
-      # Kinesis input stream name
-      stream-name: "{{sinkKinesisInStreamName}}"
+# Sinks currently supported are:
+# 'kinesis' for writing records to a Kinesis stream
+# 'nsq' for writing records to an NSQ topic
+sink: "{{sink}}"
 
-      # LATEST: most recent data.
-      # TRIM_HORIZON: oldest available data.
-      # Note: This only affects the first run of this application
-      # on a stream.
-      initial-position: "TRIM_HORIZON"
+# The following are used to authenticate for the Amazon Kinesis sink.
+#
+# If both are set to 'default', the default provider chain is used
+# (see http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html)
+#
+# If both are set to 'iam', use AWS IAM Roles to provision credentials.
+#
+# If both are set to 'env', use environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
+aws {
+  access-key: "iam"
+  secret-key: "iam"
+}
 
-      # Maximum number of records to read per GetRecords call
-      max-records: {{sinkKinesisMaxRecords}}
-    }
+# Config for NSQ
+nsq {
+  # Channel name for NSQ source
+  channel-name: "{{NsqSourceChannelName}}"
+
+  # Host name for NSQ tools
+  host: "{{NsqHost}}"
 
-    out {
-      # Stream for events for which the storage process fails
-      stream-name: "{{sinkKinesisOutStreamName}}"
-    }
+  # Port for nsqd
+  port: "{{NsqdPort}}"
 
-    region: "{{sinkKinesisRegion}}"
+  # Port for nsqlookupd
+  lookup-port: {{NsqlookupdPort}}
+}
 
-    # "app-name" is used for a DynamoDB table to maintain stream state.
-    # You can set it automatically using: "SnowplowLzoS3Sink-$\\{sink.kinesis.in.stream-name\\}"
-    app-name: "{{sinkKinesisAppName}}"
-  }
+kinesis {
+  # LATEST: most recent data.
+  # TRIM_HORIZON: oldest available data.
+  # Note: This only affects the first run of this application
+  # on a stream.
+  initial-position: "TRIM_HORIZON"
 
-  s3 {
-    # If using us-east-1, then endpoint should be "http://s3.amazonaws.com".
-    # Otherwise "http://s3-<<region>>.s3.amazonaws.com", e.g.
-    # http://s3-eu-west-1.amazonaws.com
-    region: "{{sinkKinesisS3Region}}"
-    bucket: "{{sinkKinesisS3Bucket}}"
+  # Maximum number of records to read per GetRecords call
+  max-records: {{sinkKinesisMaxRecords}}
 
-    # Format is one of lzo or gzip
-    # Note, that you can use gzip only for enriched data stream.
-    format: "{{sinkKinesisFormat}}"
+  region: "{{sinkKinesisRegion}}"
 
-    # Maximum Timeout that the application is allowed to fail for
-    max-timeout: {{sinkKinesisMaxTimeout}}
-  }
+  # "app-name" is used for a DynamoDB table to maintain stream state.
+  # You can set it automatically using: "SnowplowLzoS3Sink-$\\{sink.kinesis.in.stream-name\\}"
+  app-name: "{{sinkKinesisAppName}}"
+}
+
+streams {
+  # Input stream name
+  stream-name-in = "{{InStreamName}}"
+
+  # Stream for events for which the storage process fails
+  stream-name-out = "{{OutStreamName}}"
 
   # Events are accumulated in a buffer before being sent to S3.
   # The buffer is emptied whenever:
   # - the combined size of the stored records exceeds byte-limit or
   # - the number of stored records exceeds record-limit or
   # - the time in milliseconds since it was last emptied exceeds time-limit
   buffer {
-    byte-limit: {{sinkLzoBufferByteThreshold}}
+    byte-limit: {{sinkLzoBufferByteThreshold}} # Not supported by NSQ; will be ignored
     record-limit: {{sinkLzoBufferRecordThreshold}}
-    time-limit: {{sinkLzoBufferTimeThreshold}}
+    time-limit: {{sinkLzoBufferTimeThreshold}} # Not supported by NSQ; will be ignored
   }
+}
 
-  # Set the Logging Level for the S3 Sink
-  # Options: ERROR, WARN, INFO, DEBUG, TRACE
-  logging {
-    level: "{{sinkLzoLogLevel}}"
-  }
+s3 {
+  # If using us-east-1, then endpoint should be "http://s3.amazonaws.com".
+  # Otherwise "http://s3-<<region>>.s3.amazonaws.com", e.g.
+  # http://s3-eu-west-1.amazonaws.com
+  region: "{{sinkKinesisS3Region}}"
+  bucket: "{{sinkKinesisS3Bucket}}"
+
+  # Format is one of lzo or gzip
+  # Note that you can use gzip only for the enriched data stream.
+  format: "{{sinkKinesisFormat}}"
+
+  # Maximum timeout that the application is allowed to fail for
+  max-timeout: {{sinkKinesisMaxTimeout}}
+}
+
+# Set the Logging Level for the S3 Sink
+# Options: ERROR, WARN, INFO, DEBUG, TRACE
+logging {
+  level: "{{sinkLzoLogLevel}}"
+}
 
-  # Optional section for tracking endpoints
-  monitoring {
-    snowplow {
-      collector-uri: "{{collectorUri}}"
-      collector-port: 80
-      app-id: "{{sinkLzoAppName}}"
-      method: "GET"
-    }
+# Optional section for tracking endpoints
+monitoring {
+  snowplow {
+    collector-uri: "{{collectorUri}}"
+    collector-port: 80
+    app-id: "{{sinkLzoAppName}}"
+    method: "GET"
   }
 }
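The sample now selects the input and output transport with the top-level source and sink keys. Below is a minimal sketch of how an application might read those keys with the Typesafe Config library (already a dependency above); the object and value names are illustrative, not the commit's actual wiring code:

import com.typesafe.config.{Config, ConfigFactory}

object ConfigSketch {
  def main(args: Array[String]): Unit = {
    // Parse the HOCON file passed on the command line
    val conf: Config = ConfigFactory.parseFile(new java.io.File(args(0))).resolve()

    val in = conf.getString("streams.stream-name-in")
    conf.getString("source") match {
      case "nsq" =>
        // NSQ reads are discovered through nsqlookupd at host:lookup-port
        println(s"NSQ topic $in via ${conf.getString("nsq.host")}:${conf.getInt("nsq.lookup-port")}")
      case "kinesis" =>
        println(s"Kinesis stream $in in region ${conf.getString("kinesis.region")}")
      case other =>
        sys.error(s"Unsupported source: $other")
    }
  }
}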
NsqSourceExecutor.scala (new file, +151)

@@ -0,0 +1,151 @@
+/**
+ * Copyright (c) 2014-2016 Snowplow Analytics Ltd.
+ * All rights reserved.
+ *
+ * This program is licensed to you under the Apache License Version 2.0,
+ * and you may not use this file except in compliance with the Apache
+ * License Version 2.0.
+ * You may obtain a copy of the Apache License Version 2.0 at
+ * http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the Apache License Version 2.0 is distributed
+ * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied.
+ *
+ * See the Apache License Version 2.0 for the specific language
+ * governing permissions and limitations there under.
+ */
+
+package com.snowplowanalytics.snowplow.storage.kinesis.s3
+
+// AWS Kinesis Connector libs
+import com.amazonaws.services.kinesis.connectors.KinesisConnectorConfiguration
+
+// NSQ
+import com.snowplowanalytics.client.nsq.NSQConsumer
+import com.snowplowanalytics.client.nsq.lookup.DefaultNSQLookup
+import com.snowplowanalytics.client.nsq.NSQMessage
+import com.snowplowanalytics.client.nsq.NSQConfig
+import com.snowplowanalytics.client.nsq.callbacks.NSQMessageCallback
+import com.snowplowanalytics.client.nsq.callbacks.NSQErrorCallback
+import com.snowplowanalytics.client.nsq.exceptions.NSQException
+
+// Scala
+import scala.collection.mutable.ListBuffer
+import scala.collection.JavaConversions._
+import scala.util.Random
+
+// Tracker
+import com.snowplowanalytics.snowplow.scalatracker.Tracker
+
+// Scalaz
+import scalaz._
+import Scalaz._
+
+// Joda-Time
+import org.joda.time.{DateTime, DateTimeZone}
+import org.joda.time.format.DateTimeFormat
+
+// Logging
+import org.slf4j.LoggerFactory
+
+// This project
+import sinks._
+import serializers._
+
+/**
+ * Executor for NSQSource
+ *
+ * @param config the Kinesis connector configuration, reused here for the S3 settings
+ * @param nsqConfig the NSQ configuration
+ * @param badSink the configured BadSink
+ * @param serializer the instance of one of the serializers
+ * @param maxConnectionTime max time for trying to connect to the S3 instance
+ */
+class NsqSourceExecutor(
+  config: KinesisConnectorConfiguration,
+  nsqConfig: S3LoaderNsqConfig,
+  badSink: ISink,
+  serializer: ISerializer,
+  maxConnectionTime: Long,
+  tracker: Option[Tracker]
+) extends Runnable {
+
+  lazy val log = LoggerFactory.getLogger(getClass())
+
+  // NSQ messages are buffered in msgBuffer until its size reaches nsqBufferSize
+  val msgBuffer = new ListBuffer[EmitterInput]()
+
+  val s3Emitter = new S3Emitter(config, badSink, maxConnectionTime, tracker)
+  private val TimeFormat = DateTimeFormat.forPattern("HH:mm:ss.SSS").withZone(DateTimeZone.UTC)
+  private val DateFormat = DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.UTC)
+
+  // e.g. "2017-09-13-10:15:30.123-10:15:32.456-1234567890"
+  private def getBaseFilename(startTime: Long, endTime: Long): String = {
+    def abs(e: Int): Int = if (e > 0) e else -e
+    val currentTimeObject = new DateTime(System.currentTimeMillis())
+    val startTimeObject = new DateTime(startTime)
+    val endTimeObject = new DateTime(endTime)
+    val randNum = Random.nextInt
+
+    DateFormat.print(currentTimeObject) + "-" +
+      TimeFormat.print(startTimeObject) + "-" +
+      TimeFormat.print(endTimeObject) + "-" + abs(randNum)
+  }
+
+  override def run: Unit = {
+
+    val nsqCallback = new NSQMessageCallback {
+      // start time of filling the buffer
+      var bufferStartTime = System.currentTimeMillis()
+      val nsqBufferSize = config.BUFFER_RECORD_COUNT_LIMIT
+
+      override def message(msg: NSQMessage): Unit = {
+        val validMsg = msg.getMessage.success
+        msgBuffer.synchronized {
+          msgBuffer += validMsg
+          msg.finished()
+          if (msgBuffer.size >= nsqBufferSize) {
+            // finish time of filling the buffer
+            val bufferEndTime = System.currentTimeMillis()
+            val baseFilename = getBaseFilename(bufferStartTime, bufferEndTime)
+            val serializationResults = serializer.serialize(msgBuffer.toList, baseFilename)
+            val (successes, failures) = serializationResults.results.partition(_.isSuccess)
+
+            if (successes.size > 0) {
+              serializationResults.namedStreams.foreach { stream =>
+                val connectionAttemptStartTime = System.currentTimeMillis()
+                s3Emitter.attemptEmit(stream, connectionAttemptStartTime) match {
+                  case false => log.error("Error while sending to S3")
+                  case true =>
+                }
+              }
+            }
+
+            if (failures.size > 0) {
+              s3Emitter.sendFailures(failures)
+            }
+
+            msgBuffer.clear()
+            // the next buffer's start time is the current buffer's finish time
+            bufferStartTime = bufferEndTime
+          }
+        }
+      }
+    }
+
+    val errorCallback = new NSQErrorCallback {
+      override def error(e: NSQException) =
+        log.error(s"Exception while consuming topic ${nsqConfig.nsqSourceTopicName}", e)
+    }
+
+    val lookup = new DefaultNSQLookup
+    // use nsqlookupd to discover the nsqd instances serving the topic
+    lookup.addLookupAddress(nsqConfig.nsqHost, nsqConfig.nsqlookupPort)
+    val consumer = new NSQConsumer(lookup,
+                                   nsqConfig.nsqSourceTopicName,
+                                   nsqConfig.nsqSourceChannelName,
+                                   nsqCallback,
+                                   new NSQConfig(),
+                                   errorCallback)
+    consumer.start()
+  }
+}
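For context, the executor is a Runnable, so the caller is expected to construct it and hand it to a thread. A sketch of that wiring follows; every value name below is illustrative, since the commit's main-class changes are not part of this diff:

// Sketch only: construct the executor and run it on its own thread.
val executor = new NsqSourceExecutor(
  connectorConfig,   // KinesisConnectorConfiguration carrying the S3/buffer settings
  nsqConfig,         // S3LoaderNsqConfig built from the nsq { ... } block
  badSink,           // ISink receiving records that fail serialization
  serializer,        // ISerializer, e.g. the LZO or GZIP one per the "format" key
  60000L,            // maxConnectionTime: ms allowed for S3 connection attempts
  None               // tracker: Option[Tracker] for the monitoring section
)
new Thread(executor).start()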
