This repository was archived by the owner on Feb 18, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparsetools.py
200 lines (164 loc) · 5.54 KB
/
parsetools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Various Functions used to extract data from arbitrary text files
"""
import re
import copy
def parse_flags(string, startflag, endflag, reflags=re.S):
    """Extract the raw text found between two regex flags.

    Args:
        string: Text to search.
        startflag: Regex source matched immediately before the wanted text.
        endflag: Regex source matched immediately after the wanted text.
        reflags: Flags handed to ``re.search``; the default ``re.S`` lets the
            captured text span several lines.

    Returns:
        The text captured lazily between ``startflag`` and ``endflag`` in the
        leftmost match, with neither flag included.

    Raises:
        AttributeError: If the flags are not found (``re.search`` returns
            ``None``).
    """
    # A named group keeps the result correct even when ``startflag`` itself
    # contains capturing groups, which would otherwise shift group numbers.
    pattern = r"{}(?P<_flagged_body>.*?){}".format(startflag, endflag)
    return re.search(pattern, string, flags=reflags).group("_flagged_body")
def sanitize_item(string):
    """Strip whitespace from a token and convert it to a number if possible.

    Args:
        string: Raw token text.

    Returns:
        An ``int`` for an optionally negative run of digits; a ``float`` for
        a decimal number optionally followed by a single non-digit exponent
        marker and a signed two-digit exponent (for example ``1.5E-05`` or
        ``0.1D+01``); otherwise the stripped string unchanged.
    """
    string = string.strip()
    if re.match(r'^-?\d+$', string):
        return int(string)
    match = re.match(r'^(-?\d+(\.\d+)?)([^\d]([+-]?\d\d))?$', string)
    if match is None:
        return string
    mantissa, exponent = match.group(1), match.group(4)
    if not exponent:
        return float(mantissa)
    # Let float() parse mantissa and exponent together: multiplying the
    # mantissa by 10**exponent rounds twice and drifts for negative exponents.
    return float("{}e{}".format(mantissa, int(exponent)))
def sanitize_items(list_):
    """Apply sanitize_item to every entry and return the results as a new list"""
    return [sanitize_item(entry) for entry in list_]
def sanitize_list(string):
    """Split a string on whitespace and sanitize each resulting token"""
    return [sanitize_item(token) for token in string.split()]
def parse_array(string):
    """Interpret column-blocked data in the form
    1 2 3
    # # #
    # # #
    # # #
    Found in gaussian files

    A header line lists column labels; each row beneath it supplies one value
    per label, aligned from the right so that extra leading items on a row
    are ignored.  Any later line with the same leading whitespace as the
    first non-blank line is treated as a new header starting another block
    of columns.

    Args:
        string: Raw text of the array section.

    Returns:
        A list of columns ordered by integer label when every label is an
        integer, otherwise a dict mapping each label (as the raw header
        token) to its column.  Each column is the list of sanitized values
        collected for that label, in row order.

    Raises:
        IndexError: If a data row holds fewer items than the current header.
    """
    # Build a filtered list rather than deleting while iterating, which
    # skips the line that follows every removed blank line.
    lines = [line for line in string.split("\n") if line.strip()]

    columns = {}
    header = []
    header_indent = None
    for line in lines:
        indent = re.match(r"\s*(?!\s)", line).group()
        if header_indent is None or indent == header_indent:
            # Labels stay as raw tokens for every block so that the keys of
            # ``columns`` share one type and can be looked up consistently.
            header_indent = indent
            header = line.split()
            continue
        row = sanitize_list(line)
        # Walk header and row from the right-hand side in lockstep.
        for offset, label in enumerate(reversed(header), start=1):
            columns.setdefault(label, []).append(row[-offset])

    try:
        ordered = sorted(columns, key=int)
    except ValueError:
        # At least one label is not an integer: hand back the mapping.
        return columns
    return [columns[label] for label in ordered]
def equiv_line(string, name):
    """Extract the first number that follows *name* on the same line.

    Intended for simple data where there is only one number on the line.

    Args:
        string: Text to search.
        name: Regex source identifying the line; it is inserted into the
            pattern unescaped.

    Returns:
        The matched number as a string (optional leading ``-``, digits, and
        an optional decimal part).  No numeric conversion is performed.

    Raises:
        AttributeError: If *name* followed by a number is not found
            (``re.search`` returns ``None``).
    """
    # The decimal point is escaped and the integer part may span several
    # digits: an unescaped "." after a single digit cuts "123.45" down to
    # "123".  No trailing newline is required, so a final unterminated line
    # also matches.  The named group stays valid even if *name* contains its
    # own capturing groups.
    pattern = r"{}.*?(?P<_equiv_value>-?\d+(?:\.\d*)?)".format(name)
    return re.search(pattern, string).group("_equiv_value")
def multi_equiv_line(string):
    """Extract data with multiple declarations on one line
    Ex.
    desc1= num1 desc2= num2 desc3=-num3
    desc4= num4

    Args:
        string: Text holding whitespace-separated ``key= value`` pairs.

    Returns:
        A dict mapping each key to the raw string token that follows its
        ``=``.  A key made of several words is joined without separators
        (``"Total E= 2.0"`` yields the key ``"TotalE"``).

    Raises:
        IndexError: If a token ending in ``=`` is the last token, so that no
            value follows it.
    """
    # A negative value glues itself to its key ("desc=-1"); split such a
    # token so the key ends in "=" like every other key.
    tokens = []
    for item in string.split():
        if "=-" in item:
            pieces = item.split('=')
            tokens.append(pieces[0] + '=')
            tokens.append(pieces[1])
        else:
            tokens.append(item)

    result = {}
    key_start = 0  # index of the first word belonging to the next key
    for position, token in enumerate(tokens):
        if token[-1] != "=":
            continue
        # Every word since the previous value belongs to this key.
        words = tokens[key_start:position]
        words.append(token.strip('='))
        result["".join(words)] = tokens[position + 1]
        # Skip past this key's value so it is not folded into the next key.
        key_start = position + 2
    return result
def parse_table(string, titles):
    """Read the data rows of a dash-ruled table into nested dicts.

    Expected layout
    ---------------
    Column Titles
    ---------------
    Row 1 Data
    Row 2 Data
    ---------------

    Returns a dict keyed 'row0', 'row1', ... whose values map each entry of
    titles to the sanitized item found at the same position in that row.
    """
    # Split on dashed rules and keep the second-to-last chunk
    body = re.split('[-]+\n', string)[-2]
    table = {}
    # The final element of the newline split is discarded
    for number, row in enumerate(body.split('\n')[:-1]):
        cells = row.split()
        table['row{}'.format(number)] = {
            title: sanitize_item(cells[position])
            for position, title in enumerate(titles)
        }
    return table
def dict_filter(olddict, excludekeys):
    """Return a new dict holding every entry of olddict whose key is not in excludekeys"""
    return {
        key: value
        for key, value in olddict.items()
        if key not in excludekeys
    }
def dict_snip(olddict, keepkeys):
    """Return a new dict holding only the entries of olddict whose key is in keepkeys"""
    return {
        key: value
        for key, value in olddict.items()
        if key in keepkeys
    }
def dict_dupes(main, compare):
    """Collect the entries on which two dictionaries agree.

    For every key of compare that is also in main, the value is kept when
    both dictionaries hold equal values.  When the value in main is itself a
    dict, the comparison descends into it and the (possibly empty) nested
    result is stored under that key.  Values of the same key are assumed to
    have the same type in both inputs.  The result is built from a deep copy
    of main.
    """
    def shared_entries(left, right):
        found = {}
        for key in right:
            if key not in left:
                continue
            value = left[key]
            if isinstance(value, dict):
                found[key] = shared_entries(value, right[key])
            elif value == right[key]:
                found[key] = value
        return found

    return shared_entries(copy.deepcopy(main), compare)
def identity(arg):
    """Return the argument unchanged"""
    return arg
def main_parse(string, startflag, endflag, reflags=re.S, parse_type=identity):
    """Extract the text between two flags, then hand it to a parser.

    parse_flags isolates the section of string delimited by startflag and
    endflag (using reflags); parse_type is then called on that section and
    its result is returned.  With the default parse_type the section comes
    back unchanged.
    """
    return parse_type(parse_flags(string, startflag, endflag, reflags))