Cleanup Readme

sshleifer · sshleifer · commit 988015b25524 · 2018-01-15T19:19:34.000-05:00
diff --git a/README.md b/README.md
@@ -1,15 +1,15 @@
 # charactr
-Mac/iMessage users can interactively visualize their
-texting history, using pandas, [d3.js] (http://d3js.org/) and [crossfilter.js](http://square.github.io/crossfilter/).
+iPhone/iMessage users can interactively visualize their
+texting history, using pandas, [d3.js](http://d3js.org/) and [crossfilter.js](http://square.github.io/crossfilter/).
 
-You must have a mac with iMessage installed. It may also work without iMessage but with an itunes backup.
+You must have a Mac with either iMessage or iTunes backups of your iPhone.
 
-## From command line:
-- $ `git clone git@github.com:sshleifer/imsg_stats.git`
-- $ `cd imsg_stats`
-- $ `export PYTHONPATH=PYTHONPATH:"."`
-- $ `python app/run.py` to see the charts (you may need to adjust your path)
-- If that failed, try $ `pip install -r requirements.txt`
+## Instructions from command line:
+- `git clone git@github.com:sshleifer/charactr.git`
+- `cd charactr`
+- `export PYTHONPATH=$PYTHONPATH:"."`
+- `python app/run.py`
+- If that fails, try `pip install -r requirements.txt`
 
 
 
@@ -32,12 +32,11 @@ issue or email to sshleifer at gmail dot com.
 
 
 ## Saving Your Data (without viewing charts)
-- $ `git clone` **this repo**
-- $ `python scripts/save_data.py`
-- $ data will be written to `imsg_stats/msg.csv`
+- `python scripts/save_data.py`
+- data will be written to `imsg_stats/msg.csv`
 
 
-# Privacy
+## Privacy
 - we dont have a server. No data leaves your local machine!
 
 ## Future Plans
diff --git a/app/chat_to_csv.py b/app/chat_to_csv.py
@@ -6,9 +6,9 @@
 import sqlite3
 from sys import argv
 
-from app.contacts import addresses, groupbyContact
+from app.contacts import addresses, agg_by_contact
 from app.word_cloud import writeWords
-from app.helpers.utils import filterDF, msgLen, checkSavedData, concatSaved
+from app.helpers.utils import filter_based_on_col, safe_msg_len, check_saved_data, concat_saved_data
 from app.time_chart import timePanel
 
 CHAT_DB = os.path.expanduser("~/Library/Messages/chat.db")
@@ -47,7 +47,7 @@ def make_dataframe_from_db_files(db_path):
     #date_cut = lambda x: dt.datetime.fromtimestamp(x + DATE_OFFSET_THAT_SOMEHOW_WORKS)
     msg['tstamp'] = msg.date.apply(date_converter)
     msg['day'] = msg.tstamp.apply(lambda x: x.date())
-    msg['msg_len'] = msg.text.fillna('').apply(msgLen)
+    msg['msg_len'] = msg.text.fillna('').apply(safe_msg_len)
     return msg
 
 
@@ -67,16 +67,18 @@ def query_all_possible_sources(test_path=None):
 SAVE_DIR = 'csv/'
 def concat_and_deduplicate_history(saved_data=[]):
     '''combine and deduplicate the various db reads'''
-    msg = pd.concat(query_all_possible_sources()).drop_duplicates(subset=['day', 'chat_identifier', 'text'])
+    msg = pd.concat(query_all_possible_sources()).drop_duplicates(
+        subset=['day', 'chat_identifier', 'text']
+    )
     clist = addresses(msg)
-    def findName(cid):
+    def find_name(cid):
         cid = cid.replace('+1','')
         try:
             return clist[cid].rstrip()
         except KeyError:
             return cid.rstrip()
-    msg['cname'] = msg.chat_identifier.apply(findName)
-    return concatSaved(msg,saved_data) if saved_data else msg
+    msg['cname'] = msg.chat_identifier.apply(find_name)
+    return concat_saved_data(msg, saved_data) if saved_data else msg
 
 
 def try_df_to_csv(df, path):
@@ -89,12 +91,12 @@ def try_df_to_csv(df, path):
 def create_csvs(hidegroups=True, use_saved=False, n_best=10):
     '''Create the relevant csvs'''
     print "being executed at", os.path.abspath('.')
-    saved_data = checkSavedData() if use_saved else []
+    saved_data = check_saved_data() if use_saved else []
     msg = concat_and_deduplicate_history(saved_data)
     print msg.shape
     if len(argv) <= 1 or hidegroups:
-        msg = filterDF(msg, 'cname', lambda x: not x.startswith('chat'))
-    ppl = groupbyContact(msg.copy()).sort_values('totlen', ascending=False)
+        msg = filter_based_on_col(msg, 'cname', lambda x: not x.startswith('chat'))
+    ppl = agg_by_contact(msg.copy()).sort_values('totlen', ascending=False)
     print ppl.head()
     besties = map(lambda x: x.rstrip(), ppl.index[:n_best])
     print 'besties:', besties
diff --git a/app/contacts.py b/app/contacts.py
@@ -1,6 +1,6 @@
 '''Reads in addresses from DB stored at path, or backup, 
 to label phone numbers.'''
-from helpers.utils import filterDF
+from helpers.utils import filter_based_on_col
 from os.path import expanduser as eu
 import os
 import pandas as pd
@@ -14,7 +14,7 @@
 MO_PTH = '31bb7ba8914766d4ba40d6dfb6113c8b614be442'
 MO_BASE = eu('~/Library/Application Support/MobileSync/Backup/')
 
-def extractContacts(path):
+def extract_contacts(path):
   '''Get Contact Data from PHONENUMBER, RECORD Tables. As in icloud_query.py'''
   try:
     ad_db = sqlite3.connect(path)
@@ -30,60 +30,63 @@ def extractContacts(path):
     print "Non-fatal DB error ON path: ", path
     return {}
 
-def extractBackupContacts(path):
-  '''makes {number: name} dict from iphone backup '''
-  try:
-    db = sqlite3.connect(path)
-    sql = 'SELECT c15Phone, c0First, c1Last, c6Organization from ABPersonFullTextSearch_content'
-    jn = pd.read_sql(sql, db)
 
-    jn = jn.applymap(lambda x: '' if x == None else x)
-    clean = lambda x: filter(lambda y: '0' <= c <= '9', x)[-17:-7]
-    clist = {x[0]: x[1] + ' ' + x[2] for x in 
-            zip(jn.c15Phone.apply(clean), jn.c0First, jn.c1Last)}
-    return clist, True      # always return true?
-  # want better except
-  except Exception:
-    print "Error in extractBackupContacts with path: ", path
-    return {}, False
+def extract_backup_contacts(path):
+    '''makes {number: name} dict from iphone backup '''
+    try:
+        db = sqlite3.connect(path)
+        sql = 'SELECT c15Phone, c0First, c1Last, c6Organization from ABPersonFullTextSearch_content'
+        jn = pd.read_sql(sql, db)
 
+        jn = jn.applymap(lambda x: '' if x == None else x)
+        clean = lambda x: filter(lambda y: '0' <= c <= '9', x)[-17:-7]
+        clist = {x[0]: x[1] + ' ' + x[2] for x in
+                zip(jn.c15Phone.apply(clean), jn.c0First, jn.c1Last)}
+        return clist, True      # always return true?
+    # want better except
+    except Exception:
+        print "Error in extractBackupContacts with path: ", path
+        return {}, False
 
-def groupNames(msg, clist):
-  '''Currently unused attempt to aggregate group chats.'''
-  chats = filterDF(msg, 'chat_id', lambda x: x.startswith('chat'))
-  gb = chats.groupby('chat_id')
-  tmp = gb.id.agg(lambda x: list(set(x)))
-  tmp = dict(zip(tmp.index, tmp.values))
-  def findName(cid):
-    cid = cid.replace('+1','')
-    try:
-      return clist[cid].split(' ')[0]
-    except KeyError:
-      return cid.rstrip()
-  return { k: ','.join([findName(x[1:].lstrip('1')) for x in v]) for k,v in
+
+def name_groupchat(msg, clist):
+    '''Currently unused attempt to aggregate group chats.'''
+    chats = filter_based_on_col(msg, 'chat_id', lambda x: x.startswith('chat'))
+    gb = chats.groupby('chat_id')
+    tmp = gb.id.agg(lambda x: list(set(x)))
+    tmp = dict(zip(tmp.index, tmp.values))
+    def findName(cid):
+        cid = cid.replace('+1','')
+        try:
+            return clist[cid].split(' ')[0]
+        except KeyError:
+            return cid.rstrip()
+    return { k: ','.join([findName(x[1:].lstrip('1')) for x in v]) for k,v in
             tmp.iteritems()}
 
+
 def addresses(msg=[]):
   '''create the {number: name} dictionary from contacts app, or phone backup.'''
   success = False       # so we only call extractBackupContacts once
-  contact_list = extractContacts(COMP_PATH)
+  contact_list = extract_contacts(COMP_PATH)
   paths = filter(lambda x: MO_PTH in x[2], os.walk(MO_BASE))
   backups = [os.path.join(MO_BASE, x[0], MO_PTH) for x in paths]
   if os.path.exists(SRCS):
     backups.extend([os.path.join(SRCS,mid,ENDING) for mid in os.listdir(SRCS)])
   for bu in backups: 
     if not success and MO_PTH in bu:
-      new_cdict, success = extractBackupContacts(bu)
+      new_cdict, success = extract_backup_contacts(bu)
       contact_list.update(new_cdict)
     else:
-        contact_list.update(extractContacts(bu))
+        contact_list.update(extract_contacts(bu))
   if not contact_list:
     print "Contacts: checked", COMP_PATH, backups 
     print "NO CONTACTS FOUND"
   #contact_list = groupNames(msg, contact_list)
   return contact_list
 
-def groupbyContact(msg):
+
+def agg_by_contact(msg):
   '''Group conversations by contact, and calculate summary stats.
     The data that underlies the scatter plot.'''
   msg['snt_chars'] = msg['is_sent'] * msg['msg_len']
diff --git a/app/helpers/utils.py b/app/helpers/utils.py
@@ -1,24 +1,28 @@
 import pandas as pd
+import datetime as dt
 
-def msgLen(text):
-  return len(text) if text else 0
-
-def filterDF(df, col, bool_func):
-  df['keep'] = df[col].apply(bool_func)
-  return df[df.keep].drop('keep', 1)
-
-def checkSavedData():
-  '''Get saved data if it exists.''' 
-  keep = ['ROWID_x','text','tstamp','chat_id','is_sent','cname']
-  return pd.read_csv('msg.csv')[keep] if os.path.exists('msg.csv') else []
-
-def concatSaved(msg, saved_data):
-  '''Adds saved data to extracted data, if saved data exists.'''
-  str2date = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
-  saved_data['tstamp'] = saved_data.tstamp.apply(str2date) 
-  if 'msg_len' not in saved_data.columns:
-    saved_data['msg_len'] =  saved_data.text.fillna(0).apply(msgLen)
-  cutoff = saved_data.tstamp.max()
-  return pd.concat([saved_data, msg[msg.tstamp > cutoff]])
+
+def safe_msg_len(text):
+    return len(text) if text else 0
+
+
+def filter_based_on_col(df, col, bool_func):
+    return df.loc[df[col].apply(bool_func)]
+
+
+def check_saved_data():
+    '''Get saved data if it exists.'''
+    keep = ['ROWID_x','text','tstamp','chat_id','is_sent','cname']
+    return pd.read_csv('msg.csv')[keep] if os.path.exists('msg.csv') else []
+
+
+def concat_saved_data(msg, saved_data):
+    '''Adds saved data to extracted data, if saved data exists.'''
+    str2date = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
+    saved_data['tstamp'] = saved_data.tstamp.apply(str2date)
+    if 'msg_len' not in saved_data.columns:
+      saved_data['msg_len'] =  saved_data.text.fillna(0).apply(safe_msg_len)
+    cutoff = saved_data.tstamp.max()
+    return pd.concat([saved_data, msg[msg.tstamp > cutoff]])
 
 
diff --git a/app/time_chart.py b/app/time_chart.py
@@ -3,7 +3,7 @@
 # and the y axis is characters exchanged
 import numpy as np
 import pandas as pd
-from helpers.utils import filterDF
+from helpers.utils import filter_based_on_col
 from types import *
 
 def getSumStats(gb):
@@ -52,7 +52,7 @@ def timePanel(msg, besties=False, topn=10):
   assert isinstance(msg, pd.DataFrame)
   ts =  byDate(msg, byContact=True)
   if not besties: besties = topN(ts, topn)
-  ts = filterDF(ts,  'cname', lambda x: x in besties)[['cname','ymd','msg_len']]
+  ts = filter_based_on_col(ts, 'cname', lambda x: x in besties)[['cname', 'ymd', 'msg_len']]
   ts.columns = ['key','date','value']
   datestr = lambda x: str(x.date())
   full_range = map(datestr, pd.date_range(ts.date.min(), ts.date.max()))
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,2 @@
-numpy=1.9
-scipy
-pandas
+pandas==0.19.0
+numpy