1
+ import numpy as np
2
+ import os
3
+ import pickle
4
+ import csv
5
+ import datetime as dt
6
+
7
def preprocess_data(char_set, input_case=None, log_buffer=dt.timedelta(seconds=1)):
    """Pair keystroke chunks from keyfreq logs with the motion-sensor data
    recorded by the left- and right-hand devices while they were typed.

    char_set: set of characters for which input has to be considered
    input_case: ['room'], ['library'], 'all'/['all'], or None (== all)
    log_buffer: timedelta giving the buffer allowed between two successive
        keyfreq readings before the current typing chunk is closed

    Returns the 7-tuple (keystrokes, lh_accel, lh_gyro, lh_gravity,
    rh_accel, rh_gyro, rh_gravity); each element holds one entry per
    processed log file.
    """
    # BUG FIX: the original default was the *list* ["all"] but the check
    # compared it against the *string* "all", so the default never expanded
    # to both folders. Also avoids a mutable default argument.
    if input_case is None or input_case == "all" or input_case == ["all"]:
        input_case = ["room", "library"]

    final_log_keystrokes = []
    final_lefthand_gyro = []
    final_lefthand_accel = []
    final_lefthand_gravity = []
    final_righthand_gyro = []
    final_righthand_accel = []
    final_righthand_gravity = []
    keyfreq_folder = '../Data/keyfreq/'
    righthand_folder = '../Data/righthand/'
    lefthand_folder = '../Data/lefthand/'

    for folder in input_case:
        log_files = [os.path.join(keyfreq_folder, folder, f)
                     for f in sorted(os.listdir(keyfreq_folder + folder))]
        lefthand_files = [os.path.join(lefthand_folder, folder, f)
                          for f in sorted(os.listdir(lefthand_folder + folder))]
        righthand_files = [os.path.join(righthand_folder, folder, f)
                           for f in sorted(os.listdir(righthand_folder + folder))]
        print("total number of files in %s: %d" % (folder, len(log_files)))

        # Going over the keylogged files first.
        for file_num, log_file in enumerate(log_files):
            curr_log_keystrokes = []
            curr_lefthand_gyro, curr_lefthand_accel, curr_lefthand_gravity = [], [], []
            curr_righthand_gyro, curr_righthand_accel, curr_righthand_gravity = [], [], []

            with open(log_file) as curr_log_f:
                # The sensor session files are matched against the first
                # keyfreq timestamp of this log file.
                line0_time = _keyfreq_time(curr_log_f.readline().split()[0])
                # BUG FIX: the original later passed the stale loop variables
                # lh_file/rh_file to find_starting_line/get_chunk_data; when
                # no file matched, that silently used the *last* candidate.
                righthand_file = _session_file(righthand_files, line0_time)
                lefthand_file = _session_file(lefthand_files, line0_time)
                if lefthand_file is None or righthand_file is None:
                    print("no matching sensor file for %s; skipping" % log_file)
                    continue
                # Rewind so the chunking loop below also sees line 0.
                curr_log_f.seek(0)

                lh_accel0, lh_gyro0, lh_gravity0 = find_starting_line(lefthand_file)
                rh_accel0, rh_gyro0, rh_gravity0 = find_starting_line(righthand_file)

                for line in curr_log_f:
                    line_split = line.split()
                    if line_split[-1] not in char_set:
                        continue
                    curr_log_start_time = _keyfreq_time(line_split[0])
                    # Collapse 'seconds:micros' into 'secondsmicros'.
                    ts_parts = line_split[0].split(':')
                    line_split[0] = ts_parts[0] + ts_parts[1]
                    # NOTE(review): each new chunk *replaces* the previous
                    # keystroke list while the sensor lists accumulate one
                    # entry per chunk, so only the last chunk's keystrokes
                    # survive per file. Preserved as-is — confirm with the
                    # consumers of the returned structure.
                    curr_log_keystrokes = [line_split]
                    prev_end_time = curr_log_start_time
                    curr_log_end_time = None

                    # Consume lines from the *same* file iterator until a gap
                    # longer than log_buffer before a non-char_set key closes
                    # the chunk.
                    for end_line in curr_log_f:
                        end_line_split = end_line.split()
                        curr_end_time = _keyfreq_time(end_line_split[0])
                        if (curr_end_time - prev_end_time) > log_buffer \
                                and end_line_split[-1] not in char_set:
                            curr_log_end_time = prev_end_time
                            break
                        prev_end_time = curr_end_time
                        ts_parts = end_line_split[0].split(':')
                        end_line_split[0] = ts_parts[0] + ts_parts[1]
                        # BUG FIX: the original appended line_split (the
                        # chunk's first line) over and over instead of the
                        # line it had just parsed.
                        curr_log_keystrokes.append(end_line_split)
                    # BUG FIX: if the log ends while still inside a chunk the
                    # original left curr_log_end_time as None and the
                    # arithmetic below raised TypeError.
                    if curr_log_end_time is None:
                        curr_log_end_time = prev_end_time

                    # Pad the chunk by half the buffer on each side.
                    chunk_start = curr_log_start_time - log_buffer / 2
                    chunk_end = curr_log_end_time + log_buffer / 2
                    curr_lefthand_gyro.append(get_chunk_data(lefthand_file, chunk_start, chunk_end, lh_gyro0))
                    curr_lefthand_accel.append(get_chunk_data(lefthand_file, chunk_start, chunk_end, lh_accel0))
                    curr_lefthand_gravity.append(get_chunk_data(lefthand_file, chunk_start, chunk_end, lh_gravity0))
                    curr_righthand_gyro.append(get_chunk_data(righthand_file, chunk_start, chunk_end, rh_gyro0))
                    curr_righthand_accel.append(get_chunk_data(righthand_file, chunk_start, chunk_end, rh_accel0))
                    curr_righthand_gravity.append(get_chunk_data(righthand_file, chunk_start, chunk_end, rh_gravity0))

            final_log_keystrokes.append(curr_log_keystrokes)
            final_lefthand_gyro.append(curr_lefthand_gyro)
            final_lefthand_accel.append(curr_lefthand_accel)
            final_lefthand_gravity.append(curr_lefthand_gravity)
            final_righthand_gyro.append(curr_righthand_gyro)
            final_righthand_accel.append(curr_righthand_accel)
            final_righthand_gravity.append(curr_righthand_gravity)
            print("Successfully completed %d files" % (file_num + 1))

    return (final_log_keystrokes,
            final_lefthand_accel, final_lefthand_gyro, final_lefthand_gravity,
            final_righthand_accel, final_righthand_gyro, final_righthand_gravity)


def _keyfreq_time(field):
    """Parse a keyfreq 'seconds:microseconds' timestamp field to a datetime."""
    secs, micros = field.split(':')[:2]
    return dt.datetime.fromtimestamp(float(secs)) + dt.timedelta(microseconds=float(micros))


def _session_file(files, t0):
    """Return the first (sorted) file whose name-embedded time is after t0.

    File names are assumed to end in 'YYYYmmdd-HHMMSS.<ext>' (15 timestamp
    characters followed by a 4-character extension). Returns None when no
    file starts after t0.
    """
    for f in files:
        if dt.datetime.strptime(f[-19:-4], '%Y%m%d-%H%M%S') > t0:
            return f
    return None
103
+
104
def find_starting_line(filename):
    """Locate the starting row of each sensor stream in a sensor CSV file.

    filename: path to a left/right-hand sensor CSV file whose rows' first
        field names the sensor they belong to.

    Returns (accel_start, gyro_start, gravity_start): the 0-based indices of
    the first rows whose first field contains "accel", "gyro" and "grav"
    respectively, or None for any sensor that never appears.
    """
    accel_start, gyro_start, gravity_start = None, None, None
    # BUG FIX: 'r' instead of 'rb' — the csv module requires text-mode rows
    # under Python 3, and text mode also works under Python 2 here.
    with open(filename, 'r') as sensor_f:
        for i, row in enumerate(csv.reader(sensor_f)):
            if not row:
                # Robustness: skip blank lines instead of raising IndexError.
                continue
            label = row[0].lower()
            if accel_start is None and "accel" in label:
                accel_start = i
            if gyro_start is None and "gyro" in label:
                gyro_start = i
            if gravity_start is None and "grav" in label:
                gravity_start = i
            if accel_start is not None and gyro_start is not None \
                    and gravity_start is not None:
                # All three found — no need to scan the rest of the file.
                break
    return accel_start, gyro_start, gravity_start
122
+
123
def get_chunk_data(filename, start_time, end_time, offset):
    """Extract the sensor rows recorded strictly between two datetimes.

    filename: sensor CSV file whose rows hold the timestamp split across
        columns 1 and 2 in the forms ' %b %d %Y' and '%H:%M:%S.%f'
        (note the leading space on the date field).
    start_time, end_time: datetime bounds, both exclusive.
    offset: 0-based index of the row where this sensor's stream starts
        (as returned by find_starting_line).

    Returns a list of rows, each ['<epoch_secs><microseconds>', values...].
    """
    result_set = []
    # BUG FIX: 'r' instead of 'rb' — the csv module requires text-mode rows
    # under Python 3, and text mode also works under Python 2 here.
    with open(filename, 'r') as f:
        # Stream the file instead of materialising every row up front
        # (the original built list(reader) on each of the six calls per
        # chunk); rows before `offset` belong to other sensor streams.
        for i, row in enumerate(csv.reader(f)):
            if i < offset:
                continue
            curr_time = dt.datetime.strptime(row[1] + " " + row[2],
                                             ' %b %d %Y %H:%M:%S.%f')
            if start_time < curr_time < end_time:
                # NOTE(review): '%s' (epoch seconds) is a platform-specific
                # strftime extension (glibc/macOS), and the microseconds are
                # appended without zero padding — kept to match the keyfreq
                # timestamp format used elsewhere in this file.
                curr_set = [dt.datetime.strftime(curr_time, '%s')
                            + str(curr_time.microsecond)]
                curr_set.extend(row[3:])
                result_set.append(curr_set)
            if curr_time > end_time:
                # Rows are assumed time-ordered, so nothing later can match.
                break
    return result_set
0 commit comments