Skip to content

Commit 988015b

Browse files
author
sshleifer
committedJan 16, 2018
Cleanup Readme
1 parent dcb4983 commit 988015b

File tree

6 files changed

+89
-82
lines changed

6 files changed

+89
-82
lines changed
 

‎README.md

+12-13
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# charactr
2-
Mac/iMessage users can interactively visualize their
3-
texting history, using pandas, [d3.js] (http://d3js.org/) and [crossfilter.js](http://square.github.io/crossfilter/).
2+
iPhone/iMessage users can interactively visualize their
3+
texting history, using pandas, [d3.js](http://d3js.org/) and [crossfilter.js](http://square.github.io/crossfilter/).
44

5-
You must have a mac with iMessage installed. It may also work without iMessage but with an itunes backup.
5+
You must have a Mac with either iMessage or iTunes backups of your iPhone.
66

7-
## From command line:
8-
- $ `git clone git@github.com:sshleifer/imsg_stats.git`
9-
- $ `cd imsg_stats`
10-
- $ `export PYTHONPATH=PYTHONPATH:"."`
11-
- $ `python app/run.py` to see the charts (you may need to adjust your path)
12-
- If that failed, try $ `pip install -r requirements.txt`
7+
## Instructions from command line:
8+
- `git clone git@github.com:sshleifer/charactr.git`
9+
- `cd charactr`
10+
- `export PYTHONPATH=$PYTHONPATH:"."`
11+
- `python app/run.py`
12+
- If that failed, try `pip install -r requirements.txt`
1313

1414

1515

@@ -32,12 +32,11 @@ issue or email to sshleifer at gmail dot com.
3232

3333

3434
## Saving Your Data (without viewing charts)
35-
- $ `git clone` **this repo**
36-
- $ `python scripts/save_data.py`
37-
- $ data will be written to `imsg_stats/msg.csv`
35+
- `python scripts/save_data.py`
36+
- data will be written to `imsg_stats/msg.csv`
3837

3938

40-
# Privacy
39+
## Privacy
4140
- We don't have a server. No data leaves your local machine!
4241

4342
## Future Plans

‎app/chat_to_csv.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import sqlite3
77
from sys import argv
88

9-
from app.contacts import addresses, groupbyContact
9+
from app.contacts import addresses, agg_by_contact
1010
from app.word_cloud import writeWords
11-
from app.helpers.utils import filterDF, msgLen, checkSavedData, concatSaved
11+
from app.helpers.utils import filter_based_on_col, safe_msg_len, check_saved_data, concat_saved_data
1212
from app.time_chart import timePanel
1313

1414
CHAT_DB = os.path.expanduser("~/Library/Messages/chat.db")
@@ -47,7 +47,7 @@ def make_dataframe_from_db_files(db_path):
4747
#date_cut = lambda x: dt.datetime.fromtimestamp(x + DATE_OFFSET_THAT_SOMEHOW_WORKS)
4848
msg['tstamp'] = msg.date.apply(date_converter)
4949
msg['day'] = msg.tstamp.apply(lambda x: x.date())
50-
msg['msg_len'] = msg.text.fillna('').apply(msgLen)
50+
msg['msg_len'] = msg.text.fillna('').apply(safe_msg_len)
5151
return msg
5252

5353

@@ -67,16 +67,18 @@ def query_all_possible_sources(test_path=None):
6767
SAVE_DIR = 'csv/'
6868
def concat_and_deduplicate_history(saved_data=[]):
6969
'''combine and deduplicate the various db reads'''
70-
msg = pd.concat(query_all_possible_sources()).drop_duplicates(subset=['day', 'chat_identifier', 'text'])
70+
msg = pd.concat(query_all_possible_sources()).drop_duplicates(
71+
subset=['day', 'chat_identifier', 'text']
72+
)
7173
clist = addresses(msg)
72-
def findName(cid):
74+
def find_name(cid):
7375
cid = cid.replace('+1','')
7476
try:
7577
return clist[cid].rstrip()
7678
except KeyError:
7779
return cid.rstrip()
78-
msg['cname'] = msg.chat_identifier.apply(findName)
79-
return concatSaved(msg,saved_data) if saved_data else msg
80+
msg['cname'] = msg.chat_identifier.apply(find_name)
81+
return concat_saved_data(msg, saved_data) if saved_data else msg
8082

8183

8284
def try_df_to_csv(df, path):
@@ -89,12 +91,12 @@ def try_df_to_csv(df, path):
8991
def create_csvs(hidegroups=True, use_saved=False, n_best=10):
9092
'''Create the relevant csvs'''
9193
print "being executed at", os.path.abspath('.')
92-
saved_data = checkSavedData() if use_saved else []
94+
saved_data = check_saved_data() if use_saved else []
9395
msg = concat_and_deduplicate_history(saved_data)
9496
print msg.shape
9597
if len(argv) <= 1 or hidegroups:
96-
msg = filterDF(msg, 'cname', lambda x: not x.startswith('chat'))
97-
ppl = groupbyContact(msg.copy()).sort_values('totlen', ascending=False)
98+
msg = filter_based_on_col(msg, 'cname', lambda x: not x.startswith('chat'))
99+
ppl = agg_by_contact(msg.copy()).sort_values('totlen', ascending=False)
98100
print ppl.head()
99101
besties = map(lambda x: x.rstrip(), ppl.index[:n_best])
100102
print 'besties:', besties

‎app/contacts.py

+37-34
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
'''Reads in addresses from DB stored at path, or backup,
22
to label phone numbers.'''
3-
from helpers.utils import filterDF
3+
from helpers.utils import filter_based_on_col
44
from os.path import expanduser as eu
55
import os
66
import pandas as pd
@@ -14,7 +14,7 @@
1414
MO_PTH = '31bb7ba8914766d4ba40d6dfb6113c8b614be442'
1515
MO_BASE = eu('~/Library/Application Support/MobileSync/Backup/')
1616

17-
def extractContacts(path):
17+
def extract_contacts(path):
1818
'''Get Contact Data from PHONENUMBER, RECORD Tables. As in icloud_query.py'''
1919
try:
2020
ad_db = sqlite3.connect(path)
@@ -30,60 +30,63 @@ def extractContacts(path):
3030
print "Non-fatal DB error ON path: ", path
3131
return {}
3232

33-
def extractBackupContacts(path):
34-
'''makes {number: name} dict from iphone backup '''
35-
try:
36-
db = sqlite3.connect(path)
37-
sql = 'SELECT c15Phone, c0First, c1Last, c6Organization from ABPersonFullTextSearch_content'
38-
jn = pd.read_sql(sql, db)
3933

40-
jn = jn.applymap(lambda x: '' if x == None else x)
41-
clean = lambda x: filter(lambda y: '0' <= c <= '9', x)[-17:-7]
42-
clist = {x[0]: x[1] + ' ' + x[2] for x in
43-
zip(jn.c15Phone.apply(clean), jn.c0First, jn.c1Last)}
44-
return clist, True # always return true?
45-
# want better except
46-
except Exception:
47-
print "Error in extractBackupContacts with path: ", path
48-
return {}, False
34+
def extract_backup_contacts(path):
35+
'''makes {number: name} dict from iphone backup '''
36+
try:
37+
db = sqlite3.connect(path)
38+
sql = 'SELECT c15Phone, c0First, c1Last, c6Organization from ABPersonFullTextSearch_content'
39+
jn = pd.read_sql(sql, db)
4940

41+
jn = jn.applymap(lambda x: '' if x == None else x)
42+
clean = lambda x: filter(lambda y: '0' <= c <= '9', x)[-17:-7]
43+
clist = {x[0]: x[1] + ' ' + x[2] for x in
44+
zip(jn.c15Phone.apply(clean), jn.c0First, jn.c1Last)}
45+
return clist, True # always return true?
46+
# want better except
47+
except Exception:
48+
print "Error in extractBackupContacts with path: ", path
49+
return {}, False
5050

51-
def groupNames(msg, clist):
52-
'''Currently unused attempt to aggregate group chats.'''
53-
chats = filterDF(msg, 'chat_id', lambda x: x.startswith('chat'))
54-
gb = chats.groupby('chat_id')
55-
tmp = gb.id.agg(lambda x: list(set(x)))
56-
tmp = dict(zip(tmp.index, tmp.values))
57-
def findName(cid):
58-
cid = cid.replace('+1','')
59-
try:
60-
return clist[cid].split(' ')[0]
61-
except KeyError:
62-
return cid.rstrip()
63-
return { k: ','.join([findName(x[1:].lstrip('1')) for x in v]) for k,v in
51+
52+
def name_groupchat(msg, clist):
53+
'''Currently unused attempt to aggregate group chats.'''
54+
chats = filter_based_on_col(msg, 'chat_id', lambda x: x.startswith('chat'))
55+
gb = chats.groupby('chat_id')
56+
tmp = gb.id.agg(lambda x: list(set(x)))
57+
tmp = dict(zip(tmp.index, tmp.values))
58+
def findName(cid):
59+
cid = cid.replace('+1','')
60+
try:
61+
return clist[cid].split(' ')[0]
62+
except KeyError:
63+
return cid.rstrip()
64+
return { k: ','.join([findName(x[1:].lstrip('1')) for x in v]) for k,v in
6465
tmp.iteritems()}
6566

67+
6668
def addresses(msg=[]):
6769
'''create the {number: name} dictionary from contacts app, or phone backup.'''
6870
success = False # so we only call extractBackupContacts once
69-
contact_list = extractContacts(COMP_PATH)
71+
contact_list = extract_contacts(COMP_PATH)
7072
paths = filter(lambda x: MO_PTH in x[2], os.walk(MO_BASE))
7173
backups = [os.path.join(MO_BASE, x[0], MO_PTH) for x in paths]
7274
if os.path.exists(SRCS):
7375
backups.extend([os.path.join(SRCS,mid,ENDING) for mid in os.listdir(SRCS)])
7476
for bu in backups:
7577
if not success and MO_PTH in bu:
76-
new_cdict, success = extractBackupContacts(bu)
78+
new_cdict, success = extract_backup_contacts(bu)
7779
contact_list.update(new_cdict)
7880
else:
79-
contact_list.update(extractContacts(bu))
81+
contact_list.update(extract_contacts(bu))
8082
if not contact_list:
8183
print "Contacts: checked", COMP_PATH, backups
8284
print "NO CONTACTS FOUND"
8385
#contact_list = groupNames(msg, contact_list)
8486
return contact_list
8587

86-
def groupbyContact(msg):
88+
89+
def agg_by_contact(msg):
8790
'''Group conversations by contact, and calculate summary stats.
8891
The data that underlies the scatter plot.'''
8992
msg['snt_chars'] = msg['is_sent'] * msg['msg_len']

‎app/helpers/utils.py

+24-20
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
import pandas as pd
2+
import datetime as dt
23

3-
def msgLen(text):
4-
return len(text) if text else 0
5-
6-
def filterDF(df, col, bool_func):
7-
df['keep'] = df[col].apply(bool_func)
8-
return df[df.keep].drop('keep', 1)
9-
10-
def checkSavedData():
11-
'''Get saved data if it exists.'''
12-
keep = ['ROWID_x','text','tstamp','chat_id','is_sent','cname']
13-
return pd.read_csv('msg.csv')[keep] if os.path.exists('msg.csv') else []
14-
15-
def concatSaved(msg, saved_data):
16-
'''Adds saved data to extracted data, if saved data exists.'''
17-
str2date = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
18-
saved_data['tstamp'] = saved_data.tstamp.apply(str2date)
19-
if 'msg_len' not in saved_data.columns:
20-
saved_data['msg_len'] = saved_data.text.fillna(0).apply(msgLen)
21-
cutoff = saved_data.tstamp.max()
22-
return pd.concat([saved_data, msg[msg.tstamp > cutoff]])
4+
5+
def safe_msg_len(text):
6+
return len(text) if text else 0
7+
8+
9+
def filter_based_on_col(df, col, bool_func):
10+
return df.loc[df[col].apply(bool_func)]
11+
12+
13+
def check_saved_data():
14+
'''Get saved data if it exists.'''
15+
keep = ['ROWID_x','text','tstamp','chat_id','is_sent','cname']
16+
return pd.read_csv('msg.csv')[keep] if os.path.exists('msg.csv') else []
17+
18+
19+
def concat_saved_data(msg, saved_data):
20+
'''Adds saved data to extracted data, if saved data exists.'''
21+
str2date = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
22+
saved_data['tstamp'] = saved_data.tstamp.apply(str2date)
23+
if 'msg_len' not in saved_data.columns:
24+
saved_data['msg_len'] = saved_data.text.fillna(0).apply(safe_msg_len)
25+
cutoff = saved_data.tstamp.max()
26+
return pd.concat([saved_data, msg[msg.tstamp > cutoff]])
2327

2428

‎app/time_chart.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# and the y axis is characters exchanged
44
import numpy as np
55
import pandas as pd
6-
from helpers.utils import filterDF
6+
from helpers.utils import filter_based_on_col
77
from types import *
88

99
def getSumStats(gb):
@@ -52,7 +52,7 @@ def timePanel(msg, besties=False, topn=10):
5252
assert isinstance(msg, pd.DataFrame)
5353
ts = byDate(msg, byContact=True)
5454
if not besties: besties = topN(ts, topn)
55-
ts = filterDF(ts, 'cname', lambda x: x in besties)[['cname','ymd','msg_len']]
55+
ts = filter_based_on_col(ts, 'cname', lambda x: x in besties)[['cname', 'ymd', 'msg_len']]
5656
ts.columns = ['key','date','value']
5757
datestr = lambda x: str(x.date())
5858
full_range = map(datestr, pd.date_range(ts.date.min(), ts.date.max()))

‎requirements.txt

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
numpy=1.9
2-
scipy
3-
pandas
1+
pandas==0.19.0
2+
numpy

0 commit comments

Comments
 (0)
Please sign in to comment.