-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataSelection.py
93 lines (75 loc) · 2.85 KB
/
dataSelection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# ----------------------------------------------------------------
# Select SOCC comment threads of length at least 700 words.
#
# (C) 2020 Laurens Bosman, Discourse Processing Lab, SFU
# Released under GNU General Public License (GPL)
# email [email protected]
# ----------------------------------------------------------------
import pandas as pd
import nltk
from nltk import word_tokenize
import csv
from pathlib import Path
def findThreads(inputFile, outputFolder, *, minWords=700, minComments=3):
    """Copy every sufficiently long comment thread of ``inputFile`` to a CSV.

    The input CSV is read with pandas.  The column at position 1 is used as
    the comment id and the column at position 6 as the comment text
    (presumably ``comment_counter`` and the comment body of the SOCC comment
    file -- confirm against the data set).  Consecutive rows belong to the
    same thread when the second and third underscore-separated fields of
    their ids are equal.

    A thread is written to ``<outputFolder>gnm_threads.csv`` (columns ``id``
    and ``text``, one row per comment) when it has at least ``minWords``
    tokens, as counted by ``nltk.word_tokenize``, and at least
    ``minComments`` comments.

    Args:
        inputFile: path of the comment CSV to read.
        outputFolder: output folder prefix; it is concatenated directly with
            the file name, so it has to end with a path separator.
        minWords: minimum number of tokens a thread needs to be kept.
        minComments: minimum number of comments a thread needs to be kept.

    Raises:
        IndexError: if a comment id has fewer than three ``_``-separated
            fields, or the input has fewer than seven columns.
    """
    # import comment data from csv
    data = pd.read_csv(inputFile)

    def threadKey(commentId):
        # A tuple keeps ("1", "23") and ("12", "3") apart; plain string
        # concatenation of the two fields would make them collide.
        parts = commentId.split('_')
        return parts[1], parts[2]

    # newline='' is what the csv module requires for files handed to
    # csv.writer; utf-8 matches the default encoding pd.read_csv uses when
    # the file is read back.
    with open('%sgnm_threads.csv' % outputFolder, mode='w', newline='',
              encoding='utf-8') as output:
        writer = csv.writer(output, delimiter=',')
        writer.writerow(["id", "text"])

        def flushThread(thread, wordCount, commentCount):
            # store thread in output file if it is long enough
            if wordCount >= minWords and commentCount >= minComments:
                writer.writerows(thread.items())

        currentKey = None
        thread = {}
        threadSize = 0
        numComments = 0

        # Columns are addressed by position through .iloc, which avoids the
        # deprecated positional fallback of Series.__getitem__.
        for commentId, text in zip(data.iloc[:, 1], data.iloc[:, 6]):
            key = threadKey(commentId)
            if key != currentKey:
                # a new thread starts: emit the finished one and reset
                flushThread(thread, threadSize, numComments)
                currentKey = key
                thread = {}
                threadSize = 0
                numComments = 0
            thread[commentId] = text
            threadSize += len(word_tokenize(text))
            numComments += 1

        # The last thread has no successor to trigger its output, so it has
        # to be flushed explicitly once the input is exhausted.
        flushThread(thread, threadSize, numComments)
def storeThreads(outputFolder):
    """Write each thread listed in ``gnm_threads.csv`` to its own text file.

    Reads ``<outputFolder>gnm_threads.csv``; the first column is used as the
    comment id and the second as the comment text.  Consecutive rows whose
    ids share the same second and third underscore-separated fields form one
    thread.  The texts of a thread are joined, each followed by a single
    space, and written to ``<outputFolder>threads/<id>.txt`` where ``<id>``
    is the id of the thread's first comment.  The ``threads/`` folder must
    already exist.

    Args:
        outputFolder: output folder prefix; it is concatenated directly with
            the file names, so it has to end with a path separator.

    Raises:
        IndexError: if a comment id has fewer than three ``_``-separated
            fields.
    """
    data = pd.read_csv('%sgnm_threads.csv' % outputFolder)
    if data.empty:
        # nothing was selected, so there is nothing to write
        return

    def threadKey(commentId):
        # A tuple keeps ("1", "23") and ("12", "3") apart; plain string
        # concatenation of the two fields would make them collide.
        parts = commentId.split('_')
        return parts[1], parts[2]

    def writeThread(name, texts):
        with open('%sthreads/%s.txt' % (outputFolder, name), 'w',
                  encoding='utf-8') as out:
            out.write("".join(text + " " for text in texts))

    filename = None
    currentKey = None
    texts = []

    # Columns are addressed by position through .iloc, which avoids the
    # deprecated positional fallback of Series.__getitem__.
    for commentId, text in zip(data.iloc[:, 0], data.iloc[:, 1]):
        key = threadKey(commentId)
        if key != currentKey:
            if texts:
                writeThread(filename, texts)
            # the first comment of a thread names the thread's file
            filename = commentId
            currentKey = key
            texts = []
        texts.append(text)

    # The last thread has no successor to trigger its output, so it has to
    # be written explicitly once all rows have been consumed.
    writeThread(filename, texts)
def main(inputFile, outputFolder):
    """Create the output folders, then select and store the threads.

    ``outputFolder`` is concatenated directly with ``"threads/"``, so it is
    expected to end with a path separator.
    """
    # make sure both the output folder and its threads/ subfolder exist
    for folder in (outputFolder, outputFolder + "threads/"):
        Path(folder).mkdir(parents=True, exist_ok=True)

    # first pick the threads, then split them into one text file each
    findThreads(inputFile, outputFolder)
    storeThreads(outputFolder)
# Command-line entry point: dataSelection.py <inputFile> <outputFolder>
if __name__ == "__main__":
    import argparse

    # The description states the selection rule actually applied by the
    # script: at least 700 words and more than two comments per thread.
    parser = argparse.ArgumentParser(
        description='select comment threads of at least 700 words and more than two comments')
    parser.add_argument('inputFile', type=str, help='the path to the input csv')
    # the folder is used as a plain string prefix, hence the trailing slash
    parser.add_argument('outputFolder', type=str, help='the path to the output folder; eg. ./folder1/folder2/')
    args = parser.parse_args()
    main(args.inputFile, args.outputFolder)