Skip to content

Commit c52e6be

Browse files
authored Jul 5, 2021
Merge pull request #1 from DriftAsimov/main
Experimental Python Doc Scraper
2 parents 891e39a + 0715b0c commit c52e6be

File tree

1 file changed

+45
-84
lines changed

1 file changed

+45
-84
lines changed
 

brainfeed-soup.ipynb → souppy.ipynb

+45 -84
Original file line number | Diff line number | Diff line change
@@ -2,20 +2,19 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
6-
"id": "1a6cdb3f-759a-4e8c-a351-79934a29c1b4",
5+
"execution_count": null,
76
"metadata": {},
87
"outputs": [],
98
"source": [
109
"# Imports\n",
1110
"from bs4 import BeautifulSoup\n",
12-
"from requests import Session"
11+
"from requests import Session\n",
12+
"from urllib.parse import urlparse"
1313
]
1414
},
1515
{
1616
"cell_type": "code",
17-
"execution_count": 3,
18-
"id": "841f5dcf-80da-4fea-b866-94af3c490fe4",
17+
"execution_count": null,
1918
"metadata": {},
2019
"outputs": [],
2120
"source": [
@@ -116,8 +115,7 @@
116115
},
117116
{
118117
"cell_type": "code",
119-
"execution_count": 58,
120-
"id": "716bc3dd-1d51-447b-ac13-61da1d859116",
118+
"execution_count": null,
121119
"metadata": {},
122120
"outputs": [],
123121
"source": [
@@ -126,8 +124,7 @@
126124
},
127125
{
128126
"cell_type": "code",
129-
"execution_count": 59,
130-
"id": "816e5cdc-fd61-41f0-8c11-d72cea22d1a4",
127+
"execution_count": null,
131128
"metadata": {},
132129
"outputs": [],
133130
"source": [
@@ -136,8 +133,7 @@
136133
},
137134
{
138135
"cell_type": "code",
139-
"execution_count": 60,
140-
"id": "36610e53-f418-4ee5-a9ab-07a68e73f65f",
136+
"execution_count": null,
141137
"metadata": {},
142138
"outputs": [],
143139
"source": [
@@ -147,8 +143,7 @@
147143
},
148144
{
149145
"cell_type": "code",
150-
"execution_count": 61,
151-
"id": "9d29e929-92d7-4024-9601-b04c384f9435",
146+
"execution_count": null,
152147
"metadata": {},
153148
"outputs": [],
154149
"source": [
@@ -157,8 +152,7 @@
157152
},
158153
{
159154
"cell_type": "code",
160-
"execution_count": 120,
161-
"id": "002dde50-0148-4278-89b0-00d9e8e0d1c2",
155+
"execution_count": null,
162156
"metadata": {},
163157
"outputs": [],
164158
"source": [
@@ -167,8 +161,7 @@
167161
},
168162
{
169163
"cell_type": "code",
170-
"execution_count": 121,
171-
"id": "ba80d72f-618f-4124-b7fc-c7b8c962afcf",
164+
"execution_count": null,
172165
"metadata": {},
173166
"outputs": [],
174167
"source": [
@@ -185,8 +178,7 @@
185178
},
186179
{
187180
"cell_type": "code",
188-
"execution_count": 122,
189-
"id": "fedb9f43-c929-42e6-a8f5-0c2e41c7849e",
181+
"execution_count": null,
190182
"metadata": {},
191183
"outputs": [],
192184
"source": [
@@ -195,8 +187,7 @@
195187
},
196188
{
197189
"cell_type": "code",
198-
"execution_count": 141,
199-
"id": "b03b0e44-3364-4180-b6ad-b8ec3ed3d46a",
190+
"execution_count": null,
200191
"metadata": {},
201192
"outputs": [],
202193
"source": [
@@ -206,8 +197,16 @@
206197
},
207198
{
208199
"cell_type": "code",
209-
"execution_count": 142,
210-
"id": "fff3107b-dddf-4fab-b3f3-4807510d63ea",
200+
"execution_count": null,
201+
"metadata": {},
202+
"outputs": [],
203+
"source": [
204+
"url = urlparse(url).geturl().replace(\"\\\\\", \"/\")"
205+
]
206+
},
207+
{
208+
"cell_type": "code",
209+
"execution_count": null,
211210
"metadata": {},
212211
"outputs": [],
213212
"source": [
@@ -216,98 +215,60 @@
216215
},
217216
{
218217
"cell_type": "code",
219-
"execution_count": 143,
220-
"id": "2642b64b-c216-4ff9-b2db-359037d28582",
218+
"execution_count": null,
221219
"metadata": {},
222220
"outputs": [],
223221
"source": [
224-
"soup = BeautifulSoup(res2.text)"
222+
"soup = BeautifulSoup(res2.text, \"html.parser\")"
225223
]
226224
},
227225
{
228226
"cell_type": "code",
229-
"execution_count": 147,
230-
"id": "8e9736cf-382a-43f7-a404-282b23b29b6d",
227+
"execution_count": null,
231228
"metadata": {},
232-
"outputs": [
233-
{
234-
"data": {
235-
"text/plain": [
236-
"'Raised when indentation contains an inconsistent use of tabs and spaces.\\nThis is a subclass of'"
237-
]
238-
},
239-
"execution_count": 147,
240-
"metadata": {},
241-
"output_type": "execute_result"
242-
}
243-
],
229+
"outputs": [],
244230
"source": [
245-
"e = soup.find(id=id_)\n",
246-
"e.parent.p.contents[0].strip()"
231+
"base_element = soup.find(\"dt\", {'id' : id_})\r\n",
232+
"base_parent = base_element.parent"
247233
]
248234
},
249235
{
250236
"cell_type": "code",
251-
"execution_count": 148,
252-
"id": "5fda45f4-b36e-4ca7-8921-1a30ae26849f",
237+
"execution_count": null,
253238
"metadata": {},
254-
"outputs": [
255-
{
256-
"data": {
257-
"text/plain": [
258-
"'TabError'"
259-
]
260-
},
261-
"execution_count": 148,
262-
"metadata": {},
263-
"output_type": "execute_result"
264-
}
265-
],
239+
"outputs": [],
266240
"source": [
267-
"id_"
241+
"para = base_parent.find(\"dd\").text"
268242
]
269243
},
270244
{
271245
"cell_type": "code",
272-
"execution_count": 149,
273-
"id": "e6ebee8c-3ce6-4f2e-80e1-4d4002066a58",
246+
"execution_count": null,
274247
"metadata": {},
275-
"outputs": [
276-
{
277-
"data": {
278-
"text/plain": [
279-
"'Raised when indentation contains an inconsistent use of tabs and spaces.\\nThis is a subclass of IndentationError.'"
280-
]
281-
},
282-
"execution_count": 149,
283-
"metadata": {},
284-
"output_type": "execute_result"
285-
}
286-
],
248+
"outputs": [],
287249
"source": [
288-
"e.parent.p.text"
250+
"print(para)"
289251
]
290252
}
291253
],
292254
"metadata": {
255+
"interpreter": {
256+
"hash": "63fd5069d213b44bf678585dea6b12cceca9941eaf7f819626cde1f2670de90d"
257+
},
293258
"kernelspec": {
294-
"display_name": "Python 3",
295-
"language": "python",
259+
"display_name": "Python 3.9.5 64-bit",
296260
"name": "python3"
297261
},
298262
"language_info": {
299-
"codemirror_mode": {
300-
"name": "ipython",
301-
"version": 3
302-
},
303-
"file_extension": ".py",
304-
"mimetype": "text/x-python",
305263
"name": "python",
306-
"nbconvert_exporter": "python",
307-
"pygments_lexer": "ipython3",
308-
"version": "3.9.5"
264+
"version": ""
265+
},
266+
"metadata": {
267+
"interpreter": {
268+
"hash": "63fd5069d213b44bf678585dea6b12cceca9941eaf7f819626cde1f2670de90d"
269+
}
309270
}
310271
},
311272
"nbformat": 4,
312273
"nbformat_minor": 5
313-
}
274+
}

0 commit comments

Comments
 (0)
Please sign in to comment.