Skip to content

Commit a8b23b1

Browse files
committed
Add Quandl and Data Schema
AlphaPy now supports a local data schema and Quandl feeds via the pandas web data reader package.
1 parent e1ac840 commit a8b23b1

19 files changed

+5464
-835
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,7 @@
2424
.idea/vcs.xml
2525

2626
.idea/workspace.xml
27+
.idea/other.xml
28+
alphapy/examples/Trading System/.ipynb_checkpoints/A Trading System-checkpoint.ipynb
29+
*.pkl
30+
*.png

alphapy/data.py

+119-31
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@
3131
from alphapy.frame import read_frame
3232
from alphapy.globals import ModelType
3333
from alphapy.globals import Partition, datasets
34-
from alphapy.globals import PSEP, SSEP
34+
from alphapy.globals import PD_INTRADAY_OFFSETS
35+
from alphapy.globals import PD_WEB_DATA_FEEDS
36+
from alphapy.globals import PSEP, SSEP, USEP
3537
from alphapy.globals import SamplingMethod
3638
from alphapy.globals import WILDCARD
3739

@@ -282,6 +284,53 @@ def sample_data(model):
282284
return model
283285

284286

287+
#
# Function enhance_intraday_data
#

def enhance_intraday_data(df):
    r"""Add columns to the intraday dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        The intraday dataframe. Assumes string ``date`` and ``time``
        columns parseable by ``pd.to_datetime`` (e.g. '2020-01-02',
        '09:30:00') — TODO confirm against each feed source.

    Returns
    -------
    df : pandas.DataFrame
        The dataframe indexed by datetime, with ``bar_number`` and
        ``end_of_day`` columns added and the ``date``, ``time``, and
        intermediate ``datetime`` columns removed.

    """

    # Convert the columns to proper data types

    index_column = 'datetime'
    dt_column = df['date'] + ' ' + df['time']
    df[index_column] = pd.to_datetime(dt_column)
    cols_float = ['open', 'high', 'low', 'close', 'volume']
    df[cols_float] = df[cols_float].astype(float)

    # Number the intraday bars within each trading date

    date_group = df.groupby('date')
    df['bar_number'] = date_group.cumcount()

    # Mark the last bar of each trading day. This uses the frame's
    # current (integer) index, so it must run before the index is
    # replaced with the datetime index below.

    df['end_of_day'] = False
    df.loc[date_group.tail(1).index, 'end_of_day'] = True

    # Set the frame's index to the datetime column. Setting the index
    # by column *label* makes drop=True actually remove the redundant
    # 'datetime' column; passing a DatetimeIndex object (as before)
    # left the column in the frame because drop only applies to labels.
    # The column is datetime64, so the resulting index is a DatetimeIndex.

    df.set_index(index_column, drop=True, inplace=True)

    # Return the enhanced frame

    del df['date']
    del df['time']
    return df
332+
333+
285334
#
286335
# Function get_google_data
287336
#
@@ -345,25 +394,12 @@ def get_google_data(symbol, lookback_period, fractal):
345394
dt = datetime.fromtimestamp(day_item + (interval * offset))
346395
dt = pd.to_datetime(dt)
347396
dt_date = dt.strftime('%Y-%m-%d')
348-
record = (dt, dt_date, open_item, high_item, low_item, close_item, volume_item)
397+
dt_time = dt.strftime('%H:%M:%S')
398+
record = (dt_date, dt_time, open_item, high_item, low_item, close_item, volume_item)
349399
records.append(record)
350400
# create data frame
351-
cols = ['datetime', 'date', 'open', 'high', 'low', 'close', 'volume']
401+
cols = ['date', 'time', 'open', 'high', 'low', 'close', 'volume']
352402
df = pd.DataFrame.from_records(records, columns=cols)
353-
# convert to proper data types
354-
cols_float = ['open', 'high', 'low', 'close']
355-
df[cols_float] = df[cols_float].astype(float)
356-
df['volume'] = df['volume'].astype(int)
357-
# number the intraday bars
358-
date_group = df.groupby('date')
359-
df['bar_number'] = date_group.cumcount()
360-
# mark the end of the trading day
361-
df['end_of_day'] = False
362-
del df['date']
363-
df.loc[date_group.tail(1).index, 'end_of_day'] = True
364-
# set the index to datetime
365-
df.index = df['datetime']
366-
del df['datetime']
367403
# return the dataframe
368404
return df
369405

@@ -373,7 +409,7 @@ def get_google_data(symbol, lookback_period, fractal):
373409
#
374410

375411
def get_pandas_data(schema, symbol, lookback_period):
376-
r"""Get Yahoo Finance daily data.
412+
r"""Get Pandas Web Reader data.
377413
378414
Parameters
379415
----------
@@ -391,32 +427,39 @@ def get_pandas_data(schema, symbol, lookback_period):
391427
392428
"""
393429

430+
# Quandl is a special case.
431+
432+
if 'quandl' in schema:
433+
schema, symbol_prefix = schema.split(USEP)
434+
symbol = SSEP.join([symbol_prefix, symbol]).upper()
435+
394436
# Calculate the start and end date for Yahoo.
395437

396438
start = datetime.now() - timedelta(lookback_period)
397439
end = datetime.now()
398440

399441
# Call the Pandas Web data reader.
400442

401-
df = None
402443
try:
403444
df = web.DataReader(symbol, schema, start, end)
404-
df = df.rename(columns = lambda x: x.lower().replace(' ',''))
405445
except:
446+
df = None
406447
logger.info("Could not retrieve data for: %s", symbol)
407448

408449
return df
409450

410451

411452
#
412-
# Function get_feed_data
453+
# Function get_market_data
413454
#
414455

415-
def get_feed_data(group, lookback_period):
456+
def get_market_data(model, group, lookback_period, resample_data):
416457
r"""Get data from an external feed.
417458
418459
Parameters
419460
----------
461+
model : alphapy.Model
462+
The model object describing the data.
420463
group : alphapy.Group
421464
The group of symbols.
422465
lookback_period : int
@@ -429,27 +472,71 @@ def get_feed_data(group, lookback_period):
429472
430473
"""
431474

475+
# Unpack model specifications
476+
477+
directory = model.specs['directory']
478+
extension = model.specs['extension']
479+
separator = model.specs['separator']
480+
481+
# Unpack group elements
482+
432483
gspace = group.space
433484
schema = gspace.schema
434485
fractal = gspace.fractal
486+
435487
# Determine the feed source
436-
if 'd' in fractal:
437-
# daily data (date only)
438-
logger.info("Getting Daily Data")
439-
daily_data = True
440-
else:
488+
489+
if any(substring in fractal for substring in PD_INTRADAY_OFFSETS):
441490
# intraday data (date and time)
442-
logger.info("Getting Intraday Data (Google 50-day limit)")
443-
daily_data = False
491+
logger.info("Getting Intraday Data [%s] from %s", fractal, schema)
492+
intraday_data = True
493+
index_column = 'datetime'
494+
else:
495+
# daily data or higher (date only)
496+
logger.info("Getting Daily Data [%s] from %s", fractal, schema)
497+
intraday_data = False
498+
index_column = 'date'
499+
444500
# Get the data from the relevant feed
501+
502+
data_dir = SSEP.join([directory, 'data'])
503+
pandas_data = any(substring in schema for substring in PD_WEB_DATA_FEEDS)
445504
n_periods = 0
505+
446506
for item in group.members:
447507
logger.info("Getting %s data for last %d days", item, lookback_period)
448-
if daily_data:
508+
# Locate the data source
509+
if schema == 'data':
510+
fname = frame_name(item.lower(), gspace)
511+
df = read_frame(data_dir, fname, extension, separator)
512+
if not intraday_data:
513+
df.set_index(pd.DatetimeIndex(df[index_column]),
514+
drop=True, inplace=True)
515+
elif schema == 'google' and intraday_data:
516+
df = get_google_data(item, lookback_period, fractal)
517+
elif pandas_data:
449518
df = get_pandas_data(schema, item, lookback_period)
450519
else:
451-
df = get_google_data(item, lookback_period, fractal)
520+
logger.error("Unsupported Data Source: %s", schema)
521+
# Now that we have content, standardize the data
452522
if df is not None and not df.empty:
523+
logger.info("Rows: %d", len(df))
524+
# standardize column names
525+
df = df.rename(columns = lambda x: x.lower().replace(' ',''))
526+
# add intraday columns if necessary
527+
if intraday_data:
528+
df = enhance_intraday_data(df)
529+
# order by increasing date if necessary
530+
df = df.sort_index()
531+
# resample data
532+
if resample_data:
533+
df = df.resample(fractal).agg({'open' : 'first',
534+
'high' : 'max',
535+
'low' : 'min',
536+
'close' : 'last',
537+
'volume' : 'sum'})
538+
logger.info("Rows after Resampling at %s: %d",
539+
fractal, len(df))
453540
# allocate global Frame
454541
newf = Frame(item.lower(), gspace, df)
455542
if newf is None:
@@ -460,5 +547,6 @@ def get_feed_data(group, lookback_period):
460547
n_periods = df_len
461548
else:
462549
logger.info("No DataFrame for %s", item)
550+
463551
# The number of periods actually retrieved
464552
return n_periods

0 commit comments

Comments
 (0)