Commit 79e9b91 (1 parent: b0488e5), committed Sep 10, 2019

Add type partition for bad row partition persistence (closes #146)

8 files changed: +218 -17 lines
build.sbt (+1)

@@ -39,6 +39,7 @@ lazy val root = project.in(file("."))
     Dependencies.Libraries.json4sJackson,
     Dependencies.Libraries.snowplowTracker,
     Dependencies.Libraries.pureconfig,
+    Dependencies.Libraries.igluCoreJson4s,
     // Scala (test only)
     Dependencies.Libraries.specs2,
     // Thrift (test only)
project/Dependencies.scala (+2)

@@ -39,6 +39,7 @@ object Dependencies {
     val scalaz7         = "7.2.13"
     val snowplowTracker = "0.3.0"
     val pureconfig      = "0.8.0"
+    val igluCore        = "0.5.0"
     // Scala (test only)
     val specs2          = "3.9.1"
   }
@@ -74,6 +75,7 @@ object Dependencies {
     val scalaz7         = "org.scalaz"            %% "scalaz-core"             % V.scalaz7
     val snowplowTracker = "com.snowplowanalytics" %% "snowplow-scala-tracker"  % V.snowplowTracker
     val pureconfig      = "com.github.pureconfig" %% "pureconfig"              % V.pureconfig
+    val igluCoreJson4s  = "com.snowplowanalytics" %% "iglu-core-json4s"        % V.igluCore
     // Scala (test only)
     val specs2          = "org.specs2"            %% "specs2-core"             % V.specs2 % "test"
   }
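
The new dependency supplies the SchemaKey extraction used by the emitter below. A rough sketch of the call shape, assuming the json4s implicits from iglu-core-json4s 0.5.0 (the fold arms mirror how KinesisS3Emitter.scala uses the result; the exact return type of SchemaKey.extract may differ slightly in this version):

    import org.json4s.jackson.JsonMethods.parse
    import com.snowplowanalytics.iglu.core._
    import com.snowplowanalytics.iglu.core.json4s.implicits._

    val json = parse("""{"schema": "iglu:com.acme/event/jsonschema/1-0-0", "data": {}}""")

    // Fold failure/success, as the emitter does: a missing or malformed
    // "schema" field lands in the first arm, a valid schema key in the second.
    val prefix = SchemaKey.extract(json).fold(
      _   => "partition_error",
      key => s"${key.vendor}.${key.name}"  // "com.acme.event"
    )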

src/main/scala/com.snowplowanalytics.s3/loader/KinesisS3Emitter.scala (+91 -11)

@@ -14,6 +14,8 @@ package com.snowplowanalytics.s3.loader

 // Scala
 import scala.collection.JavaConverters._
+import scala.util.Try
+import scala.util.{Success => TrySuccess}

 // Java libs
 import java.util.Calendar
@@ -32,10 +34,21 @@ import scala.collection.JavaConversions._
 // Tracker
 import com.snowplowanalytics.snowplow.scalatracker.Tracker

+// Json4s
+import org.json4s.jackson.JsonMethods.parse
+
+// Iglu Core
+import com.snowplowanalytics.iglu.core._
+import com.snowplowanalytics.iglu.core.json4s.implicits._
+
+// Scalaz
+import scalaz._
+
 // This project
 import sinks._
 import serializers._
 import model._
+import KinesisS3Emitter._

 /**
  * Emitter for flushing Kinesis event data to S3.
@@ -48,19 +61,12 @@ class KinesisS3Emitter(
   badSink: ISink,
   serializer: ISerializer,
   maxConnectionTime: Long,
-  tracker: Option[Tracker]
+  tracker: Option[Tracker],
+  partition: Boolean,
+  partitionErrorDir: String
 ) extends IEmitter[EmitterInput] {

   val s3Emitter = new S3Emitter(s3Config, provider, badSink, maxConnectionTime, tracker)
-  val dateFormat = new SimpleDateFormat("yyyy-MM-dd");
-
-  /**
-   * Determines the filename in S3, which is the corresponding
-   * Kinesis sequence range of records in the file.
-   */
-  protected def getBaseFilename(firstSeq: String, lastSeq: String): String =
-    dateFormat.format(Calendar.getInstance().getTime()) +
-      "-" + firstSeq + "-" + lastSeq

   /**
    * Reads items from a buffer and saves them to s3.
@@ -77,7 +83,19 @@ class KinesisS3Emitter(
     s3Emitter.log.info(s"Flushing buffer with ${buffer.getRecords.size} records.")

     val records = buffer.getRecords().asScala.toList
-    val baseFilename = getBaseFilename(buffer.getFirstSequenceNumber, buffer.getLastSequenceNumber)
+    if (partition) {
+      partitionWithSchemaKey(records, partitionErrorDir).foldLeft(List[EmitterInput]()) {
+        case (acc, (prefix, l)) =>
+          val baseFilename = getBaseFilename(buffer.getFirstSequenceNumber, buffer.getLastSequenceNumber, Some(prefix.getName))
+          acc ::: emitRecords(l, baseFilename)
+      }
+    } else {
+      val baseFilename = getBaseFilename(buffer.getFirstSequenceNumber, buffer.getLastSequenceNumber)
+      emitRecords(records, baseFilename)
+    }
+  }
+
+  private def emitRecords(records: List[EmitterInput], baseFilename: String) = {
     val serializationResults = serializer.serialize(records, baseFilename)
     val (successes, failures) = serializationResults.results.partition(_.isSuccess)

@@ -108,3 +126,65 @@ class KinesisS3Emitter(
   override def fail(records: java.util.List[EmitterInput]): Unit =
     s3Emitter.sendFailures(records)
 }
+
+object KinesisS3Emitter {
+
+  /** Type of a row, determined from the schema of its self-describing data */
+  sealed trait RowType extends Product with Serializable {
+    def getName: String
+  }
+
+  object RowType {
+    /** Represents rows whose type could not be determined because the row
+     *  is either not valid JSON or not a self-describing JSON
+     */
+    case class PartitionError(errorDir: String) extends RowType {
+      override def getName: String = errorDir
+    }
+
+    /** Represents rows whose type can be determined successfully,
+     *  i.e. rows that carry a proper schema key
+     */
+    case class SelfDescribing(rowType: String) extends RowType {
+      override def getName: String = rowType
+    }
+
+    case object UnexpectedError extends RowType {
+      override def getName: String = "unexpected_error"
+    }
+  }
+
+  val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
+
+  /**
+   * Determines the filename in S3, which is the corresponding
+   * Kinesis sequence range of records in the file, optionally
+   * prefixed with a partition directory.
+   */
+  private def getBaseFilename(firstSeq: String, lastSeq: String, prefix: Option[String] = None): String =
+    prefix.map(p => if (p.isEmpty) "" else p + "/").getOrElse("") +
+      dateFormat.format(Calendar.getInstance().getTime()) +
+      "-" + firstSeq + "-" + lastSeq
+
+  /**
+   * Assumes records are self-describing data and groups them by their
+   * schema key. Records that are not self-describing JSON are grouped
+   * under the partition-error directory instead.
+   */
+  private[loader] def partitionWithSchemaKey(records: List[EmitterInput], errorDir: String) = {
+    records.groupBy {
+      case Success(byteRecord) =>
+        val strRecord = new String(byteRecord, "UTF-8")
+        Try(parse(strRecord)) match {
+          case TrySuccess(json) =>
+            SchemaKey.extract(json).fold(
+              _ => RowType.PartitionError(errorDir),
+              k => RowType.SelfDescribing(s"${k.vendor}.${k.name}")
+            )
+          case _ => RowType.PartitionError(errorDir)
+        }
+      case _ => RowType.UnexpectedError
+    }
+  }
+}
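
Taken together, partitioned output gains a directory component in front of the old date-and-sequence-range filename. A minimal standalone sketch of the key scheme (FilenameSketch is a hypothetical harness; the helper mirrors getBaseFilename above, and the sequence numbers are illustrative):

    import java.text.SimpleDateFormat
    import java.util.Calendar

    // Hypothetical standalone demo of the S3 key naming introduced above.
    object FilenameSketch extends App {
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")

      def baseFilename(firstSeq: String, lastSeq: String, prefix: Option[String] = None): String =
        prefix.map(p => if (p.isEmpty) "" else p + "/").getOrElse("") +
          dateFormat.format(Calendar.getInstance().getTime()) + "-" + firstSeq + "-" + lastSeq

      // Partitioned: "com.acme1.example1/2019-09-10-49590123-49590456"
      println(baseFilename("49590123", "49590456", Some("com.acme1.example1")))
      // Unpartitioned (old behaviour): "2019-09-10-49590123-49590456"
      println(baseFilename("49590123", "49590456"))
    }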

src/main/scala/com.snowplowanalytics.s3/loader/KinesisS3Pipeline.scala (+2 -2)

@@ -28,9 +28,9 @@ import model._
 /**
  * S3Pipeline class sets up the Emitter/Buffer/Transformer/Filter
  */
-class KinesisS3Pipeline(s3Config: S3Config, badSink: ISink, serializer: ISerializer, maxConnectionTime: Long, tracker: Option[Tracker]) extends IKinesisConnectorPipeline[ValidatedRecord, EmitterInput] {
+class KinesisS3Pipeline(s3Config: S3Config, badSink: ISink, serializer: ISerializer, maxConnectionTime: Long, tracker: Option[Tracker], partition: Boolean, partitionErrorDir: String) extends IKinesisConnectorPipeline[ValidatedRecord, EmitterInput] {

-  override def getEmitter(configuration: KinesisConnectorConfiguration) = new KinesisS3Emitter(s3Config, configuration.AWS_CREDENTIALS_PROVIDER, badSink, serializer, maxConnectionTime, tracker)
+  override def getEmitter(configuration: KinesisConnectorConfiguration) = new KinesisS3Emitter(s3Config, configuration.AWS_CREDENTIALS_PROVIDER, badSink, serializer, maxConnectionTime, tracker, partition, partitionErrorDir)

   override def getBuffer(configuration: KinesisConnectorConfiguration) = new BasicMemoryBuffer[ValidatedRecord](configuration)
src/main/scala/com.snowplowanalytics.s3/loader/KinesisSourceExecutor.scala (+4 -2)

@@ -49,7 +49,9 @@ class KinesisSourceExecutor(
   badSink: ISink,
   serializer: ISerializer,
   maxConnectionTime: Long,
-  tracker: Option[Tracker]
+  tracker: Option[Tracker],
+  partition: Boolean,
+  partitionErrorDir: String
 ) extends KinesisConnectorExecutorBase[ValidatedRecord, EmitterInput] {

   val LOG = LoggerFactory.getLogger(getClass)
@@ -113,5 +115,5 @@ class KinesisSourceExecutor(
   initialize(config, null)

   override def getKinesisConnectorRecordProcessorFactory =
-    new KinesisConnectorRecordProcessorFactory[ValidatedRecord, EmitterInput](new KinesisS3Pipeline(s3Config, badSink, serializer, maxConnectionTime, tracker), config)
+    new KinesisConnectorRecordProcessorFactory[ValidatedRecord, EmitterInput](new KinesisS3Pipeline(s3Config, badSink, serializer, maxConnectionTime, tracker, partition, partitionErrorDir), config)
 }

src/main/scala/com.snowplowanalytics.s3/loader/SinkApp.scala (+3 -1)

@@ -149,7 +149,9 @@ object SinkApp {
           badSink,
           serializer,
           maxConnectionTime,
-          tracker
+          tracker,
+          config.partition.getOrElse(false),
+          config.partitionErrorDir.getOrElse("")
         ).success
       // Read records from NSQ
       case "nsq" =>

src/main/scala/com.snowplowanalytics.s3/loader/model.scala (+3 -1)

@@ -91,6 +91,8 @@ package model {
     kinesis: KinesisConfig,
     streams: StreamsConfig,
     s3: S3Config,
-    monitoring: Option[MonitoringConfig]
+    monitoring: Option[MonitoringConfig],
+    partition: Option[Boolean],
+    partitionErrorDir: Option[String]
   )
 }
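
Both new fields are optional, so existing configuration files keep working unchanged. A small sketch of the defaulting that SinkApp.scala above applies (ExampleConfig is a hypothetical stand-in for the full config case class):

    // Hypothetical, abbreviated view of the config shape from model.scala.
    case class ExampleConfig(partition: Option[Boolean], partitionErrorDir: Option[String])

    // Fields absent from the config file arrive as None...
    val cfg = ExampleConfig(partition = None, partitionErrorDir = None)

    // ...so partitioning stays off and the error prefix stays empty by default.
    val partitionEnabled = cfg.partition.getOrElse(false)
    val errorDir         = cfg.partitionErrorDir.getOrElse("")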
KinesisS3EmitterSpec.scala (new file, +112)

/*
 * Copyright (c) 2014-2019 Snowplow Analytics Ltd. All rights reserved.
 *
 * This program is licensed to you under the Apache License Version 2.0,
 * and you may not use this file except in compliance with the Apache License Version 2.0.
 * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the Apache License Version 2.0 is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
 */
package com.snowplowanalytics.s3.loader

import scalaz._

import org.specs2.mutable.Specification

class KinesisS3EmitterSpec extends Specification {

  "KinesisS3Emitter" should {

    "partition records correctly according to schema key" in {
      val dataType11 =
        """
          | {
          |   "schema": "iglu:com.acme1/example1/jsonschema/2-0-1",
          |   "data": "data1"
          | }
        """.stripMargin.getBytes("UTF-8")

      val dataType21 =
        """
          | {
          |   "schema": "iglu:com.acme1/example2/jsonschema/2-0-0",
          |   "data": "data1"
          | }
        """.stripMargin.getBytes("UTF-8")

      val dataType22 =
        """
          | {
          |   "schema": "iglu:com.acme1/example2/jsonschema/2-0-1",
          |   "data": "data2"
          | }
        """.stripMargin.getBytes("UTF-8")

      val dataType31 =
        """
          | {
          |   "schema": "iglu:com.acme2/example1/jsonschema/2-0-0",
          |   "data": "data1"
          | }
        """.stripMargin.getBytes("UTF-8")

      val dataType32 =
        """
          | {
          |   "schema": "iglu:com.acme2/example1/jsonschema/2-0-1",
          |   "data": "data2"
          | }
        """.stripMargin.getBytes("UTF-8")

      val dataType33 =
        """
          | {
          |   "schema": "iglu:com.acme2/example1/jsonschema/2-0-1",
          |   "data": "data3"
          | }
        """.stripMargin.getBytes("UTF-8")

      val nonSelfDescribingJson =
        """
          | {
          |   "data": "data",
          |   "key": "value"
          | }
        """.stripMargin.getBytes("UTF-8")

      val nonJsonData = "nonJsonData".getBytes("UTF-8")

      val failure = FailedRecord(List("error1", "error2"), "line")

      val records = List(
        Success(dataType11),
        Success(dataType21),
        Success(dataType22),
        Success(dataType31),
        Success(dataType32),
        Success(dataType33),
        Failure(failure),
        Failure(failure),
        Success(nonSelfDescribingJson),
        Success(nonJsonData)
      )

      val res = KinesisS3Emitter.partitionWithSchemaKey(records, "partition_error_dir")
        .map { case (rowType, l) => (rowType.getName, l) }

      val expected = Map(
        "com.acme1.example1" -> List(Success(dataType11)),
        "com.acme1.example2" -> List(Success(dataType21), Success(dataType22)),
        "com.acme2.example1" -> List(Success(dataType31), Success(dataType32), Success(dataType33)),
        "partition_error_dir" -> List(Success(nonSelfDescribingJson), Success(nonJsonData)),
        "unexpected_error" -> List(Failure(failure), Failure(failure))
      )

      res must beEqualTo(expected)
    }
  }

}
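
With partitioning enabled, the bucket therefore gains one directory per vendor.name pair, plus the configured partition-error directory for rows that are not self-describing JSON and an unexpected_error directory for records that already failed upstream: exactly the grouping the spec above asserts.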
