-
-
Notifications
You must be signed in to change notification settings - Fork 49
/
Copy pathCsvFileReader.kt
172 lines (159 loc) · 7.04 KB
/
CsvFileReader.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
package com.github.doyaaaaaken.kotlincsv.client
import com.github.doyaaaaaken.kotlincsv.dsl.context.CSVReaderNullFieldIndicator
import com.github.doyaaaaaken.kotlincsv.dsl.context.CsvReaderContext
import com.github.doyaaaaaken.kotlincsv.dsl.context.ExcessFieldsRowBehaviour
import com.github.doyaaaaaken.kotlincsv.dsl.context.InsufficientFieldsRowBehaviour
import com.github.doyaaaaaken.kotlincsv.parser.CsvParser
import com.github.doyaaaaaken.kotlincsv.parser.ParserNullFieldIndicator
import com.github.doyaaaaaken.kotlincsv.util.CSVAutoRenameFailedException
import com.github.doyaaaaaken.kotlincsv.util.CSVFieldNumDifferentException
import com.github.doyaaaaaken.kotlincsv.util.MalformedCSVException
import com.github.doyaaaaaken.kotlincsv.util.logger.Logger
/**
 * CSV Reader class, which controls file I/O flow.
 *
 * Lines are pulled from the wrapped reader one at a time and handed to a [CsvParser];
 * a CSV row may span several physical lines when a field contains a line break.
 *
 * @author doyaaaaaken
 */
class CsvFileReader internal constructor(
    private val ctx: CsvReaderContext,
    reader: Reader,
    private val logger: Logger,
) {
    private val reader = BufferedLineReader(reader)

    // Incremented once per line requested from the reader (including the final end-of-input read);
    // forwarded to the parser together with the text to parse.
    private var rowNum = 0L

    private val parser = CsvParser(
        ctx.quoteChar,
        ctx.delimiter,
        ctx.escapeChar,
        ctx.withFieldAsNull.toParserNullFieldIndicator(),
    )

    /**
     * Reads the next CSV row
     * (which may contain multiple lines, because CSV fields may contain line feeds).
     *
     * @return the fields of the row as `List<String?>`,
     * or `null` if all lines have already been read.
     * @throws MalformedCSVException if the input ends while a row is still incomplete.
     */
    @Deprecated("We are considering making it a private method. If you have feedback, please comment on Issue #100.")
    fun readNext(): List<String?>? = readUntilNextCsvRow("")

    /**
     * Reads all remaining CSV rows as a lazy [Sequence].
     *
     * Every row is compared against an expected field count. Rows with a different count are
     * trimmed, padded with empty strings, skipped, or rejected, depending on the settings in the
     * reader context.
     *
     * The sequence is built on [generateSequence] without a seed, so it can be iterated only once.
     *
     * @param fieldsNum expected number of fields per row; when `null`, the size of the first row
     * read by this sequence is used.
     * @return a sequence of rows, each a list of (possibly `null`) field values.
     * @throws CSVFieldNumDifferentException during iteration, when a row has an unexpected number
     * of fields and the context is not configured to trim, pad or skip it.
     */
    @Suppress("DEPRECATION")
    fun readAllAsSequence(fieldsNum: Int? = null): Sequence<List<String?>> {
        var expectedNumFieldsInRow: Int? = fieldsNum
        return generateSequence { readNext() }.mapIndexedNotNull { idx, row ->
            // If no expected number of fields was passed in, then set it based on the first row.
            if (expectedNumFieldsInRow == null) expectedNumFieldsInRow = row.size
            // Copy into a non-nullable local so the captured nullable var needs no further null checks.
            val numFieldsInRow: Int = expectedNumFieldsInRow ?: row.size

            when {
                // Too many fields: TRIM takes precedence over skipping.
                row.size > numFieldsInRow -> when {
                    ctx.excessFieldsRowBehaviour == ExcessFieldsRowBehaviour.TRIM -> {
                        logger.info("trimming excess rows. [csv row num = ${idx + 1}, fields num = ${row.size}, fields num of row = $numFieldsInRow]")
                        row.subList(0, numFieldsInRow)
                    }
                    ctx.skipMissMatchedRow || ctx.excessFieldsRowBehaviour == ExcessFieldsRowBehaviour.IGNORE ->
                        skipMismatchedRow(idx, row, numFieldsInRow)
                    else -> throw CSVFieldNumDifferentException(numFieldsInRow, row.size, idx + 1)
                }
                // Too few fields: skipping takes precedence over padding.
                row.size < numFieldsInRow -> when {
                    ctx.skipMissMatchedRow || ctx.insufficientFieldsRowBehaviour == InsufficientFieldsRowBehaviour.IGNORE ->
                        skipMismatchedRow(idx, row, numFieldsInRow)
                    ctx.insufficientFieldsRowBehaviour == InsufficientFieldsRowBehaviour.EMPTY_STRING ->
                        row + List(numFieldsInRow - row.size) { "" }
                    else -> throw CSVFieldNumDifferentException(numFieldsInRow, row.size, idx + 1)
                }
                else -> row
            }
        }
    }

    /**
     * Logs that a row with an unexpected field count is being skipped.
     *
     * @param idx zero-based index of the row within the sequence being read.
     * @param row the offending row.
     * @param numFieldsInRow the expected number of fields.
     * @return always `null`, so that `mapIndexedNotNull` drops the row.
     */
    private fun skipMismatchedRow(
        idx: Int,
        row: List<String?>,
        numFieldsInRow: Int,
    ): Nothing? {
        logger.info("skip miss matched row. [csv row num = ${idx + 1}, fields num = ${row.size}, fields num of first row = $numFieldsInRow]")
        return null
    }

    /**
     * Reads all remaining CSV rows as a [Sequence] of header-to-value maps.
     *
     * The next row is consumed as the header row; `null` header fields become empty strings.
     * Duplicated headers are either renamed (when `autoRenameDuplicateHeaders` is set on the
     * context) or rejected.
     *
     * @return a sequence with one map per data row, or an empty sequence if there is no header row.
     * @throws MalformedCSVException if a header is duplicated and auto-renaming is disabled.
     * @throws CSVAutoRenameFailedException if auto-renaming still leaves duplicated headers.
     */
    fun readAllWithHeaderAsSequence(): Sequence<Map<String, String?>> {
        @Suppress("DEPRECATION")
        val rawHeaders = readNext()?.map { it ?: "" } ?: return emptySequence()

        val headers = if (ctx.autoRenameDuplicateHeaders) {
            deduplicateHeaders(rawHeaders)
        } else {
            val duplicated = findDuplicate(rawHeaders)
            if (duplicated != null) throw MalformedCSVException("header '$duplicated' is duplicated. please consider to use 'autoRenameDuplicateHeaders' option.")
            rawHeaders
        }
        return readAllAsSequence(headers.size).map { fields -> headers.zip(fields).toMap() }
    }

    /**
     * Closes the wrapped line reader.
     */
    fun close() {
        reader.close()
    }

    /**
     * Reads lines until they form one complete CSV row (which may contain multiple lines).
     *
     * @param leftOver text of previously read lines that did not yet form a complete row.
     * @return the fields of the row as `List<String?>`,
     * or `null` if all lines have already been read.
     * @throws MalformedCSVException if the input ends while [leftOver] is not empty.
     */
    private tailrec fun readUntilNextCsvRow(leftOver: String = ""): List<String?>? {
        val nextLine = reader.readLineWithTerminator()
        rowNum++
        return if (nextLine == null) {
            if (leftOver.isNotEmpty()) {
                throw MalformedCSVException("\"$leftOver\" on the tail of file is left on the way of parsing row")
            } else {
                null
            }
        } else if (ctx.skipEmptyLine && nextLine.isBlank() && leftOver.isBlank()) {
            // NOTE(review): isBlank() also matches whitespace-only lines, so with a whitespace
            // delimiter (e.g. tab) a row made only of empty fields is skipped here — confirm intended.
            readUntilNextCsvRow(leftOver)
        } else {
            // Concatenate once; the same text is parsed now and carried over if the row is incomplete.
            val pending = leftOver + nextLine
            // A null parse result is treated as "row not finished yet": keep reading lines.
            parser.parseRow(pending, rowNum) ?: readUntilNextCsvRow(pending)
        }
    }

    /**
     * Finds the first header that repeats an earlier one.
     *
     * @return the duplicated header, or `null` if all headers are unique.
     */
    private fun findDuplicate(headers: List<String>): String? {
        val seen = mutableSetOf<String>()
        // MutableSet.add returns false when the element is already present.
        return headers.firstOrNull { header -> !seen.add(header) }
    }

    /**
     * Deduplicates headers based on occurrence by appending "_<NUM>".
     * Ex: [a,b,b,b,c,a] => [a,b,b_2,b_3,c,a_2]
     *
     * @return the renamed headers as `List<String>`.
     * @throws CSVAutoRenameFailedException if the renamed headers still contain duplicates
     * (e.g. a generated name collides with another header).
     */
    private fun deduplicateHeaders(headers: List<String>): List<String> {
        val occurrences = mutableMapOf<String, Int>()
        return headers.map { header ->
            val count = (occurrences[header] ?: 0) + 1
            occurrences[header] = count
            if (count > 1) "${header}_$count" else header
        }.also { results ->
            if (results.size != results.distinct().size) throw CSVAutoRenameFailedException()
        }
    }

    /**
     * Maps the reader-level null-field indicator onto the parser-level constant of the same name.
     */
    private fun CSVReaderNullFieldIndicator.toParserNullFieldIndicator(): ParserNullFieldIndicator = when (this) {
        CSVReaderNullFieldIndicator.EMPTY_SEPARATORS -> ParserNullFieldIndicator.EMPTY_SEPARATORS
        CSVReaderNullFieldIndicator.EMPTY_QUOTES -> ParserNullFieldIndicator.EMPTY_QUOTES
        CSVReaderNullFieldIndicator.BOTH -> ParserNullFieldIndicator.BOTH
        CSVReaderNullFieldIndicator.NEITHER -> ParserNullFieldIndicator.NEITHER
    }
}