Skip to content

Commit c30daa4

Browse files
author
John Seekins
committed
Merge branch 'main' into auto-people
2 parents ab00e3b + 3e5b5b1 commit c30daa4

File tree

9 files changed, with 91 additions and 66 deletions.

9 files changed, with 91 additions and 66 deletions.

.github/workflows/ca-docker.yml

+3-3
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,10 @@ on:
99
- "poetry.lock"
1010
- "Dockerfile.california"
1111
- "scrapers/ca/*"
12-
jobs:
12+
jobs:
1313
publish:
14-
steps:
14+
runs-on: ubuntu-latest
15+
steps:
1516
- uses: actions/checkout@v2
1617
- name: build california docker image
1718
uses: docker/[email protected]
@@ -22,4 +23,3 @@ jobs:
2223
dockerfile: /github/workspace/Dockerfile.california
2324
tag_with_ref: true
2425
tags: latest
25-
runs-on: ubuntu-18.04

.github/workflows/docker.yml

+3-3
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@ on:
44
branches: [ main ]
55
tags:
66
- '*'
7-
jobs:
7+
jobs:
88
publish:
9-
steps:
9+
runs-on: ubuntu-latest
10+
steps:
1011
- uses: actions/checkout@v2
1112
- name: build docker image
1213
uses: docker/[email protected]
@@ -17,4 +18,3 @@ jobs:
1718
dockerfile: /github/workspace/Dockerfile
1819
tag_with_ref: true
1920
tags: latest
20-
runs-on: ubuntu-18.04

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ venv
3939

4040
# Binary data files and dumps, from Open States and state websites
4141
openstates.sqlite3
42+
openstates-postgres/
4243
dump/
4344
tmp.*
4445
*.mdb

Dockerfile

+30-14
Original file line number | Diff line number | Diff line change
@@ -1,37 +1,53 @@
11
FROM python:3.9-slim
22
LABEL maintainer="James Turk <[email protected]>"
33

4-
ENV PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 PYTHONIOENCODING='utf-8' LANG='C.UTF-8'
4+
ENV PYTHONUNBUFFERED=1
5+
ENV PYTHONDONTWRITEBYTECODE=1
6+
ENV PYTHONIOENCODING='utf-8'
7+
ENV LANG='C.UTF-8'
58

6-
RUN apt update && apt install -y --no-install-recommends \
7-
git \
8-
build-essential \
9+
RUN apt-get update -qq \
10+
&& apt-get install -y -qq --no-install-recommends \
911
curl \
12+
wget \
1013
unzip \
14+
mdbtools \
15+
libpq5 \
16+
libgdal28 \
17+
build-essential \
18+
git \
1119
libssl-dev \
1220
libffi-dev \
1321
freetds-dev \
14-
python3-virtualenv \
1522
libxml2-dev \
1623
libxslt-dev \
1724
libyaml-dev \
1825
poppler-utils \
1926
libpq-dev \
2027
libgdal-dev \
2128
libgeos-dev \
22-
wget \
23-
unzip \
24-
# libcrypto1.1 \
25-
mdbtools && \
26-
rm -rf /var/lib/apt/lists/*
29+
&& apt-get clean \
30+
&& rm -rf /var/lib/apt/lists/*
2731

28-
ADD . /opt/openstates/openstates
32+
ADD poetry.lock /opt/openstates/openstates/
33+
ADD pyproject.toml /opt/openstates/openstates/
2934
WORKDIR /opt/openstates/openstates/
3035
ENV PYTHONPATH=./scrapers
3136

32-
RUN set -ex \
33-
&& pip install poetry \
34-
&& poetry install
37+
# the last step cleans out temporarily downloaded artifacts for poetry, shrinking our build
38+
RUN pip --no-cache-dir --disable-pip-version-check install poetry \
39+
&& poetry install --no-root \
40+
&& apt-get remove -y -qq \
41+
build-essential \
42+
git \
43+
libpq-dev \
44+
&& apt-get autoremove -y -qq \
45+
&& apt-get clean \
46+
&& rm -rf /var/lib/apt/lists/*
47+
48+
ADD . /opt/openstates/openstates/
49+
RUN poetry install \
50+
&& rm -r /root/.cache/pypoetry/cache /root/.cache/pypoetry/artifacts/
3551

3652
ENV OPENSSL_CONF=/opt/openstates/openstates/openssl.cnf
3753

Dockerfile.california

+32-17
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,55 @@
11
FROM python:3.9-slim
22
LABEL maintainer="James Turk <[email protected]>"
33

4-
ENV PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 PYTHONIOENCODING='utf-8' LANG='C.UTF-8'
4+
ENV PYTHONUNBUFFERED=1
5+
ENV PYTHONDONTWRITEBYTECODE=1
6+
ENV PYTHONIOENCODING='utf-8'
7+
ENV LANG='C.UTF-8'
58

6-
RUN apt update && apt install -y --no-install-recommends \
7-
git \
8-
build-essential \
9+
RUN apt-get update -qq \
10+
&& apt-get install -y -qq --no-install-recommends \
911
curl \
12+
wget \
1013
unzip \
14+
mdbtools \
15+
libpq5 \
16+
libgdal28 \
17+
libmariadb-dev \
18+
mariadb-server \
19+
mariadb-client \
20+
build-essential \
21+
git \
1122
libssl-dev \
1223
libffi-dev \
1324
freetds-dev \
14-
python3-virtualenv \
1525
libxml2-dev \
1626
libxslt-dev \
1727
libyaml-dev \
1828
poppler-utils \
1929
libpq-dev \
2030
libgdal-dev \
2131
libgeos-dev \
22-
wget \
23-
unzip \
24-
# libcrypto1.1 \
25-
libmariadb-dev \
26-
mariadb-server \
27-
mariadb-client \
28-
mdbtools && \
29-
rm -rf /var/lib/apt/lists/*
32+
&& apt-get clean \
33+
&& rm -rf /var/lib/apt/lists/*
3034

31-
ADD . /opt/openstates/openstates
35+
ADD poetry.lock /opt/openstates/openstates/
36+
ADD pyproject.toml /opt/openstates/openstates/
3237
WORKDIR /opt/openstates/openstates/
3338
ENV PYTHONPATH=./scrapers
3439

35-
RUN set -ex \
36-
&& pip install poetry \
37-
&& poetry install --extras "california"
40+
# the last step cleans out temporarily downloaded artifacts for poetry, shrinking our build
41+
RUN pip --no-cache-dir --disable-pip-version-check install poetry \
42+
&& poetry install \
43+
&& rm -r /root/.cache/pypoetry/cache /root/.cache/pypoetry/artifacts/ \
44+
&& apt-get remove -y -qq \
45+
build-essential \
46+
git \
47+
libpq-dev \
48+
&& apt-get autoremove -y -qq \
49+
&& apt-get clean \
50+
&& rm -rf /var/lib/apt/lists/*
51+
52+
ADD . /opt/openstates/openstates/
3853

3954
ENV OPENSSL_CONF=/opt/openstates/openstates/openssl.cnf
4055

docker-compose.yml

+15-20
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,6 @@ networks:
44
openstates-network:
55
name: openstates-network
66

7-
volumes:
8-
openstates-postgres:
9-
name: openstates-postgres
10-
external: true
11-
127
services:
138
scrape:
149
build: .
@@ -22,30 +17,30 @@ services:
2217
- VIRGINIA_FTP_USER
2318
- VIRGINIA_FTP_PASSWORD
2419
volumes:
25-
- .:/opt/openstates/openstates/
20+
- .:/opt/openstates/openstates/
2621
entrypoint: ["poetry", "run", "os-update"]
2722
networks:
28-
- openstates-network
23+
- openstates-network
2924
mysql:
3025
image: mariadb:10.3
3126
# command: mysqld_safe --max_allowed_packet=512M
3227
ports:
33-
- "3306:3306"
28+
- "3306:3306"
3429
environment:
35-
- MYSQL_DATABASE=capublic
36-
- MYSQL_ALLOW_EMPTY_PASSWORD=yes
30+
- MYSQL_DATABASE=capublic
31+
- MYSQL_ALLOW_EMPTY_PASSWORD=yes
3732
networks:
38-
- openstates-network
33+
- openstates-network
3934
ca-scrape:
4035
build:
4136
context: .
4237
dockerfile: Dockerfile.california
4338
environment:
44-
- MYSQL_HOST=mysql
45-
- DATABASE_URL=postgres://openstates:openstates@db/openstatesorg
46-
- PYTHONPATH=./scrapers
39+
- MYSQL_HOST=mysql
40+
- DATABASE_URL=postgres://openstates:openstates@db/openstatesorg
41+
- PYTHONPATH=./scrapers
4742
volumes:
48-
- .:/opt/openstates/openstates/
43+
- .:/opt/openstates/openstates/
4944
entrypoint: ["poetry", "run", "os-update"]
5045
networks:
5146
- openstates-network
@@ -55,13 +50,13 @@ services:
5550
dockerfile: Dockerfile.california
5651
entrypoint: /opt/openstates/openstates/scrapers/ca/download.sh
5752
environment:
58-
- MYSQL_HOST=mysql
53+
- MYSQL_HOST=mysql
5954
volumes:
60-
- .:/opt/openstates/openstates/
55+
- .:/opt/openstates/openstates/
6156
depends_on:
62-
- mysql
57+
- mysql
6358
networks:
64-
- openstates-network
59+
- openstates-network
6560
db:
6661
image: "mdillon/postgis:11-alpine"
6762
hostname: "db"
@@ -72,6 +67,6 @@ services:
7267
POSTGRES_USER: openstates
7368
POSTGRES_DB: openstatesorg
7469
volumes:
75-
- openstates-postgres:/var/lib/postgresql/data
70+
- ./openstates-postgres/:/var/lib/postgresql/data
7671
networks:
7772
- openstates-network

scrapers/utils/__init__.py

+5-6
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,15 @@
33
from .lxmlize import LXMLMixin # noqa
44
from .lxmlize import url_xpath # noqa
55

6+
_phone_pattern = re.compile(r"\(?\d{3}\)?\s?-?\d{3}-?\d{4}")
7+
_email_pattern = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\." r"[a-zA-Z]{2,}\b")
8+
69

710
def validate_phone_number(phone_number):
811
is_valid = False
912

1013
# Phone format validation regex.
11-
phone_pattern = re.compile(r"\(?\d{3}\)?\s?-?\d{3}-?\d{4}")
12-
phone_match = phone_pattern.match(phone_number)
14+
phone_match = _phone_pattern.match(phone_number)
1315
if phone_match is not None:
1416
is_valid = True
1517

@@ -19,10 +21,7 @@ def validate_phone_number(phone_number):
1921
def validate_email_address(email_address):
2022
is_valid = False
2123

22-
email_pattern = re.compile(
23-
r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\." r"[a-zA-Z]{2,}\b"
24-
)
25-
email_match = email_pattern.match(email_address)
24+
email_match = _email_pattern.match(email_address)
2625
if email_match is not None:
2726
is_valid = True
2827

scrapers/utils/actions.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,8 @@ def __new__(
2929
for regex in regexes:
3030
if isinstance(regex, string_types):
3131
if flexible_whitespace:
32-
regex = re.sub(r"\s{1,4}", r"\\s{,10}", regex)
33-
compiled_regexes.append(re.compile(regex))
32+
c_regex = re.sub(r"\s{1,4}", r"\\s{,10}", regex)
33+
compiled_regexes.append(re.compile(c_regex))
3434
else:
3535
compiled_regexes.append(regex)
3636

scripts/init-db.sh

-1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ unset DATABASE_URL
55

66
# stop database and remove volume
77
docker-compose down
8-
docker volume rm openstates-postgres || true
98
docker-compose up -d db
109
sleep 3
1110

0 commit comments

Comments
 (0)