Commit 4074585

feature sequencing

add sequence-to-sequence input and output features

1 parent d83b117 · commit 4074585

10 files changed: +213 −54 lines changed

alphapy/__main__.py (+25 −15)

```diff
@@ -39,6 +39,7 @@
 from alphapy.features import remove_lv_features
 from alphapy.features import save_features
 from alphapy.features import select_features
+from alphapy.frame import write_frame
 from alphapy.globals import CSEP, PSEP, SSEP, USEP
 from alphapy.globals import ModelType
 from alphapy.globals import Partition, datasets
@@ -59,6 +60,7 @@
 from alphapy.optimize import rfe_search
 from alphapy.optimize import rfecv_search
 from alphapy.plots import generate_plots
+from alphapy.utilities import get_datestamp
 from alphapy.utilities import np_store_data

 import argparse
@@ -106,14 +108,17 @@ def training_pipeline(model):
     # Unpack the model specifications

     calibration = model.specs['calibration']
+    directory = model.specs['directory']
     drop = model.specs['drop']
+    extension = model.specs['extension']
     feature_selection = model.specs['feature_selection']
     grid_search = model.specs['grid_search']
     model_type = model.specs['model_type']
     predict_mode = model.specs['predict_mode']
     rfe = model.specs['rfe']
     sampling = model.specs['sampling']
     scorer = model.specs['scorer']
+    separator = model.specs['separator']
     target = model.specs['target']

     # Get train and test data
@@ -128,13 +133,6 @@ def training_pipeline(model):
         model.test_labels = True
     model = save_features(model, X_train, X_test, y_train, y_test)

-    # Drop features
-
-    logger.info("Dropping Features: %s", drop)
-    X_train = drop_features(X_train, drop)
-    X_test = drop_features(X_test, drop)
-    model = save_features(model, X_train, X_test)
-
     # Log feature statistics

     logger.info("Original Feature Statistics")
@@ -161,10 +159,24 @@ def training_pipeline(model):
                 (X_train.shape[1], X_test.shape[1]))

     # Apply treatments to the feature matrix
-
     all_features = apply_treatments(model, X)
-    X_train, X_test = np.array_split(all_features, [split_point])
-    model = save_features(model, X_train, X_test)
+
+    # Drop features
+    all_features = drop_features(all_features, drop)
+
+    # Save the train and test files with extracted and dropped features
+
+    datestamp = get_datestamp()
+    data_dir = SSEP.join([directory, 'input'])
+    df_train = all_features.iloc[:split_point, :]
+    df_train = pd.concat([df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
+    output_file = USEP.join([model.train_file, datestamp])
+    write_frame(df_train, data_dir, output_file, extension, separator)
+    df_test = all_features.iloc[split_point:, :]
+    if y_test.any():
+        df_test = pd.concat([df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
+    output_file = USEP.join([model.test_file, datestamp])
+    write_frame(df_test, data_dir, output_file, extension, separator)

     # Create crosstabs for any categorical features

@@ -315,11 +327,6 @@ def prediction_pipeline(model):
     # Load feature_map
     model = load_feature_map(model, directory)

-    # Drop features
-
-    logger.info("Dropping Features: %s", drop)
-    X_predict = drop_features(X_predict, drop)
-
     # Log feature statistics

     logger.info("Feature Statistics")
@@ -329,6 +336,9 @@ def prediction_pipeline(model):
     # Apply treatments to the feature matrix
     all_features = apply_treatments(model, X_predict)

+    # Drop features
+    all_features = drop_features(all_features, drop)
+
     # Create initial features
     all_features = create_features(model, all_features)
```
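The training pipeline now writes datestamped snapshots of the fully treated, post-drop feature matrix to the project's input directory. A minimal sketch of the naming scheme, assuming `SSEP` is `'/'` and `USEP` is `'_'` (their conventional values in `alphapy.globals`) and using a stand-in for `get_datestamp()`:

```python
from datetime import datetime

SSEP = '/'   # assumed path separator from alphapy.globals
USEP = '_'   # assumed underscore separator from alphapy.globals

def get_datestamp():
    # stand-in for alphapy.utilities.get_datestamp
    return datetime.now().strftime('%Y%m%d')

directory = 'projects/demo'   # hypothetical project directory
train_file = 'train'          # corresponds to model.train_file

data_dir = SSEP.join([directory, 'input'])              # 'projects/demo/input'
output_file = USEP.join([train_file, get_datestamp()])  # e.g. 'train_20190315'
print(SSEP.join([data_dir, output_file]))               # base path that write_frame targets
```

Each run therefore leaves a reproducible, dated copy of exactly the features the model was trained on.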

alphapy/analysis.py (+32 −11)

```diff
@@ -28,8 +28,9 @@

 from alphapy.__main__ import main_pipeline
 from alphapy.frame import load_frames
+from alphapy.frame import sequence_frame
 from alphapy.frame import write_frame
-from alphapy.globals import SSEP, USEP
+from alphapy.globals import SSEP, TAG_ID, USEP
 from alphapy.utilities import subtract_days

 from datetime import timedelta
@@ -133,7 +134,7 @@ def __str__(self):
 # Function run_analysis
 #

-def run_analysis(analysis, forecast_period, leaders,
+def run_analysis(analysis, lag_period, forecast_period, leaders,
                  predict_history, splits=True):
     r"""Run an analysis for a given model and group.
@@ -147,10 +148,14 @@ def run_analysis(analysis, forecast_period, leaders,
     ----------
     analysis : alphapy.Analysis
         The analysis to run.
+    lag_period : int
+        The number of lagged features for the analysis.
     forecast_period : int
         The period for forecasting the target of the analysis.
     leaders : list
         The features that are contemporaneous with the target.
+    predict_history : int
+        The number of periods required for lookback calculations.
     splits : bool, optional
         If ``True``, then the data for each member of the analysis
         group are in separate files.
@@ -185,7 +190,11 @@
     train_date = model.specs['train_date']

     # Calculate split date
+    logger.info("Analysis Dates")
     split_date = subtract_days(predict_date, predict_history)
+    logger.info("Train Date: %s", train_date)
+    logger.info("Split Date: %s", split_date)
+    logger.info("Test Date: %s", predict_date)

     # Load the data frames
     data_frames = load_frames(group, directory, extension, separator, splits)
@@ -203,20 +212,24 @@
     # Subset each individual frame and add to the master frame

     for df in data_frames:
+        try:
+            tag = df[TAG_ID].unique()[0]
+        except:
+            tag = 'Unknown'
+        first_date = df.index[0]
         last_date = df.index[-1]
-        # shift the target for the forecast period
-        if forecast_period > 0:
-            df[target] = df[target].shift(-forecast_period)
-        # shift any leading features if necessary
-        if leaders:
-            df[leaders] = df[leaders].shift(-1)
+        logger.info("Analyzing %s from %s to %s", tag, first_date, last_date)
+        # sequence leaders, laggards, and target(s)
+        df = sequence_frame(df, target, leaders, lag_period, forecast_period,
+                            exclude_cols=[TAG_ID])
         # get frame subsets
         if predict_mode:
             new_predict = df.loc[(df.index >= split_date) & (df.index <= last_date)]
             if len(new_predict) > 0:
                 predict_frame = predict_frame.append(new_predict)
             else:
-                logger.info("A prediction frame has zero rows. Check prediction date.")
+                logger.info("Prediction frame %s has zero rows. Check prediction date.",
+                            tag)
         else:
             # split data into train and test
             new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
@@ -225,12 +238,20 @@
                 train_frame = train_frame.append(new_train)
                 new_test = df.loc[(df.index >= split_date) & (df.index <= last_date)]
                 if len(new_test) > 0:
+                    # check if target column has NaN values
+                    nan_count = df[target].isnull().sum()
+                    forecast_check = forecast_period - 1
+                    if nan_count != forecast_check:
+                        logger.info("%s has %d records with NaN targets", tag, nan_count)
+                    # drop records with NaN values in target column
                     new_test = new_test.dropna(subset=[target])
+                    # append selected records to the test frame
                     test_frame = test_frame.append(new_test)
                 else:
-                    logger.info("A testing frame has zero rows. Check prediction date.")
+                    logger.info("Testing frame %s has zero rows. Check prediction date.",
+                                tag)
             else:
-                logger.warning("A training frame has zero rows. Check data source.")
+                logger.info("Training frame %s has zero rows. Check data source.", tag)

     # Write out the frames for input into the AlphaPy pipeline
```
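The manual shift calls are gone: `sequence_frame` now builds both the lagged input features and the forward-shifted targets in one pass. Its body is not shown in this commit, so the sketch below is a hypothetical reconstruction from the call site and the code it replaces; the bracketed column naming is an assumption consistent with the `LOFF`-based matching added in alphapy/features.py below.

```python
import pandas as pd

def sequence_frame(df, target, leaders, lag_period, forecast_period,
                   exclude_cols=None):
    """Hypothetical reconstruction -- not the actual alphapy.frame source."""
    exclude_cols = exclude_cols or []
    new_df = pd.DataFrame(index=df.index)
    for col in df.columns:
        if col in exclude_cols:
            # carry identifiers such as the tag column through unchanged
            new_df[col] = df[col]
        elif col == target:
            # shift the target back so each row is labeled forecast_period ahead
            new_df[col] = df[col].shift(-forecast_period)
        elif col in leaders:
            # leaders are contemporaneous with the target
            new_df[col] = df[col].shift(-1)
        else:
            # lagged input sequence: close[1], close[2], ..., close[lag_period]
            for lag in range(1, lag_period + 1):
                new_df['%s[%d]' % (col, lag)] = df[col].shift(lag)
    return new_df
```

Note that the last `forecast_period` rows of the target become NaN after the shift, which is exactly what the new NaN-count check in the test split above is guarding.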

alphapy/data.py (+10 −5)

```diff
@@ -420,12 +420,12 @@ def get_feed_data(group, lookback_period):
     group : alphapy.Group
         The group of symbols.
     lookback_period : int
-        The number of days of data to retrieve.
+        The number of periods of data to retrieve.

     Returns
     -------
-    daily_data : bool
-        ``True`` if daily data
+    n_periods : int
+        The maximum number of periods actually retrieved.

     """
@@ -442,6 +442,7 @@ def get_feed_data(group, lookback_period):
         logger.info("Getting Intraday Data (Google 50-day limit)")
         daily_data = False
     # Get the data from the relevant feed
+    n_periods = 0
    for item in group.members:
         logger.info("Getting %s data for last %d days", item, lookback_period)
         if daily_data:
@@ -453,7 +454,11 @@
             newf = Frame(item.lower(), gspace, df)
             if newf is None:
                 logger.error("Could not allocate Frame for: %s", item)
+            # calculate maximum number of periods
+            df_len = len(df)
+            if df_len > n_periods:
+                n_periods = df_len
         else:
             logger.info("No DataFrame for %s", item)
-    # Indicate whether or not data is daily
-    return daily_data
+    # The number of periods actually retrieved
+    return n_periods
```
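Returning the longest retrieved history instead of a daily/intraday flag lets callers size their lagged sequences against the data that actually arrived. A toy illustration of the new return value (the frames here are fabricated):

```python
# fabricated per-symbol histories standing in for downloaded frames
frames = {'aapl': list(range(250)), 'spy': list(range(252)), 'xyz': None}

n_periods = 0
for symbol, df in frames.items():
    if df is not None:
        # track the maximum number of periods across all members
        n_periods = max(n_periods, len(df))

print(n_periods)  # 252
```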

alphapy/features.py (+42 −11)

```diff
@@ -26,18 +26,21 @@
 # Imports
 #

-from alphapy.globals import BSEP, NULLTEXT, PSEP, SSEP, USEP
+from alphapy.globals import BSEP, LOFF, NULLTEXT
+from alphapy.globals import PSEP, SSEP, USEP
 from alphapy.globals import Encoders
 from alphapy.globals import ModelType
 from alphapy.globals import Scalers
 from alphapy.market_variables import Variable
+from alphapy.market_variables import vparse

 import category_encoders as ce
 from importlib import import_module
 from itertools import groupby
 import logging
 import math
 import numpy as np
+import os
 import pandas as pd
 import re
 from scipy import sparse
@@ -61,6 +64,7 @@
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.preprocessing import StandardScaler
+import sys


 #
@@ -424,6 +428,8 @@ def apply_treatment(fname, df, fparams):
     module = fparams[0]
     func_name = fparams[1]
     plist = fparams[2:]
+    # Append to system path
+    sys.path.append(os.getcwd())
     # Import the external treatment function
     ext_module = import_module(module)
     func = getattr(ext_module, func_name)
@@ -476,17 +482,34 @@ def apply_treatments(model, X):
     logger.info("Applying Treatments")
     all_features = X

-    for fname in X:
-        if treatments and fname in treatments:
-            features = apply_treatment(fname, X, treatments[fname])
-            if features is not None:
-                if features.shape[0] == X.shape[0]:
-                    all_features = pd.concat([all_features, features], axis=1)
+    if treatments:
+        for fname in treatments:
+            # find feature series
+            fcols = []
+            for col in X.columns:
+                if col.split(LOFF)[0] == fname:
+                    fcols.append(col)
+            # get lag values
+            lag_values = []
+            for item in fcols:
+                _, _, _, lag = vparse(item)
+                lag_values.append(lag)
+            # apply treatment to the most recent value
+            if lag_values:
+                f_latest = fcols[lag_values.index(min(lag_values))]
+                features = apply_treatment(f_latest, X, treatments[fname])
+                if features is not None:
+                    if features.shape[0] == X.shape[0]:
+                        all_features = pd.concat([all_features, features], axis=1)
+                    else:
+                        raise IndexError("The number of treatment rows [%d] must match X [%d]" %
+                                         (features.shape[0], X.shape[0]))
                 else:
-                    raise IndexError("The number of treatment rows [%d] must match X [%d]" %
-                                     (features.shape[0], X.shape[0]))
+                    logger.info("Could not apply treatment for feature %s", fname)
             else:
-                logger.info("Could not apply treatment for feature %s", fname)
+                logger.info("Feature %s is missing for treatment", fname)
+    else:
+        logger.info("No Treatments Specified")

     logger.info("New Feature Count : %d", all_features.shape[1])
```
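With sequencing in place, a treated feature no longer appears as a single column but as a family of lagged columns, so `apply_treatments` matches on the base name and treats only the most recent lag. A sketch of that matching rule, assuming `LOFF` is `'['` and that `vparse` returns a 4-tuple ending in the integer lag (the `vparse` here is a toy stand-in for `alphapy.market_variables.vparse`):

```python
LOFF = '['  # assumed left-offset bracket from alphapy.globals

def vparse(vname):
    # toy stand-in: 'rsi[2]' -> ('rsi[2]', 'rsi', [], 2)
    if LOFF in vname:
        root, offset = vname.split(LOFF)
        return vname, root, [], int(offset.rstrip(']'))
    return vname, vname, [], 0

columns = ['rsi[1]', 'rsi[2]', 'close[1]', 'volume[1]']
fname = 'rsi'

fcols = [col for col in columns if col.split(LOFF)[0] == fname]
lag_values = [vparse(col)[3] for col in fcols]
f_latest = fcols[lag_values.index(min(lag_values))]
print(f_latest)  # rsi[1] -- the lowest lag is the most recent value
```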

```diff
@@ -1575,7 +1598,15 @@ def drop_features(X, drop):
         The dataframe without the dropped features.

     """
-    X.drop(drop, axis=1, inplace=True, errors='ignore')
+    drop_cols = []
+    for d in drop:
+        for col in X.columns:
+            if col.split(LOFF)[0] == d:
+                drop_cols.append(col)
+    logger.info("Dropping Features: %s", drop_cols)
+    logger.info("Original Feature Count : %d", X.shape[1])
+    X.drop(drop_cols, axis=1, inplace=True, errors='ignore')
+    logger.info("Reduced Feature Count : %d", X.shape[1])
     return X
```
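`drop_features` applies the same base-name matching, so listing `close` in the drop specification now removes every lagged variant of that feature. A quick usage sketch under the same `LOFF` assumption:

```python
import pandas as pd

LOFF = '['  # assumed left-offset bracket from alphapy.globals

X = pd.DataFrame([[1, 2, 3, 4]],
                 columns=['close[1]', 'close[2]', 'volume[1]', 'rsi[1]'])
drop = ['close']  # base feature names from the model specs

drop_cols = [col for col in X.columns if col.split(LOFF)[0] in drop]
X.drop(drop_cols, axis=1, inplace=True, errors='ignore')
print(list(X.columns))  # ['volume[1]', 'rsi[1]']
```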
