Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support weighted examples #12

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
cmpIdx_ = (it != cfg.items().end())
? columnIdx[it->second.asString()] : -1;

// Optional per-example weight column; weightIdx_ stays -1 when the config
// has no "weight_column" entry (i.e. all examples are unweighted).
// Look the name up with at(), like the train_columns lookup further down:
// on a map-like container operator[] would quietly hand back a default
// index for a misspelled column name instead of reporting the error.
it = cfg.find("weight_column");
weightIdx_ = (it != cfg.items().end())
  ? columnIdx.at(it->second.asString()) : -1;

it = cfg.find("loss_function");
if (it != cfg.items().end() && it->second.asString() == "logistic") {
lossFunction_ = L2Logistic;
} else {
lossFunction_ = L2Regression;
}

const dynamic& trainColumns = cfg["train_columns"];
for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
trainIdx_.push_back(columnIdx.at(it->asString()));
Expand Down
10 changes: 8 additions & 2 deletions Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ enum LossFunction {
L2Regression = 0,
L2Logistic = 1
};

// Specifying the training parameters and data format
struct Config {

Expand Down Expand Up @@ -46,6 +46,10 @@ struct Config {
return targetIdx_;
}

// Index of the column holding the per-example weight, or -1 when the
// config did not specify a "weight_column".
int getWeightIdx() const {
return weightIdx_;
}

int getCompareIdx() const {
return cmpIdx_;
}
Expand Down Expand Up @@ -93,8 +97,10 @@ struct Config {

int targetIdx_;
int cmpIdx_;
int weightIdx_;

LossFunction lossFunction_;

std::vector<int> trainIdx_;
std::vector<int> weakIdx_;
std::vector<int> evalIdx_;
Expand Down
17 changes: 15 additions & 2 deletions DataSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
features_[i].fvec.reset(new vector<double>());
features_[i].encoding = DOUBLE;
}

if (cfg_.getWeightIdx() != -1) {
weights_.reset(new vector<double>());
}
}

bool DataSet::getEvalColumns(const std::string& line,
Expand All @@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,

bool DataSet::getRow(const string& line, double* target,
boost::scoped_array<double>& fvec,
double* weight,
double* cmpValue) const {
try {
vector<folly::StringPiece> sv;
Expand All @@ -64,7 +69,11 @@ bool DataSet::getRow(const string& line, double* target,
if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
*cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
}

// Fill in the example weight only when the caller supplied an output slot.
// Previously the else-branch also ran when `weight` was NULL and wrote
// through the null pointer. Without a configured weight column every
// example gets weight 1.0.
if (weight != NULL) {
  if (cfg_.getWeightIdx() != -1) {
    *weight = atof(sv[cfg_.getWeightIdx()].toString().c_str());
  } else {
    *weight = 1.0;
  }
}
} catch (...) {
LOG(ERROR) << "fail to process line: " << line;
return false;
Expand Down Expand Up @@ -100,7 +109,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {
}

bool DataSet::addVector(const boost::scoped_array<double>& fvec,
double target) {
double target, double weight) {
if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
return false;
}
Expand Down Expand Up @@ -128,6 +137,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
}
}
}
if (weights_) {
weights_->push_back(weight);
}

targets_.push_back(target);
numExamples_++;

Expand Down
12 changes: 9 additions & 3 deletions DataSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,14 @@ class DataSet {
public:
DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);

bool addVector(const boost::scoped_array<double>& fvec, double target);
bool addVector(const boost::scoped_array<double>& fvec,
double target, double weight);

bool getRow(const std::string& line,
double* target,
boost::scoped_array<double>& fvec,
double* cmpValue = NULL) const;
double* weight,
double* cmpValue) const;

bool getEvalColumns(const std::string& line,
boost::scoped_array<std::string>& feval) const;
Expand All @@ -63,6 +65,10 @@ class DataSet {
return numExamples_;
}

// Per-example weights, in the order examples were added via addVector().
// The pointer is null when no weights were allocated (the constructor only
// allocates them when cfg_.getWeightIdx() != -1), so callers must test it
// before dereferencing and treat null as "every weight is 1.0".
const std::unique_ptr<std::vector<double>>& getWeights() const {
return weights_;
}

void getFeatureVec(const int eid, boost::scoped_array<uint16_t>& fvec) const {
for (int i = 0; i < numFeatures_; i++) {
if (features_[i].encoding == EMPTY) {
Expand Down Expand Up @@ -103,6 +109,7 @@ class DataSet {

boost::scoped_array<FeatureData> features_;
std::vector<double> targets_;
std::unique_ptr<std::vector<double>> weights_;

friend class TreeRegressor;
friend class Gbm;
Expand All @@ -126,4 +133,3 @@ template<class T> void split(const std::vector<int>& subset,
}

}

12 changes: 7 additions & 5 deletions Gbm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
//double score = weakModel_->eval(fvec);
double score = ds_.getPrediction(weakModel_.get(), i);
F_[i] += score;
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
}
}
monitor_.decrement();
Expand Down Expand Up @@ -78,22 +79,22 @@ void Gbm::getModel(
boost::scoped_array<double> F(new double[numExamples]);
boost::scoped_array<double> y(new double[numExamples]);

double f0 = fun_.getF0(ds_.targets_);
double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
for (int i = 0; i < numExamples; i++) {
F[i] = f0;
}

model->push_back(new LeafNode<double>(f0));

double initLoss = fun_.getInitLoss(ds_.targets_);
double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());

LOG(INFO) << "init avg loss " << initLoss / numExamples;

for (int it = 0; it < cfg_.getNumTrees(); it++) {

LOG(INFO) << "------- iteration " << it << " -------";

fun_.getGradient(ds_.targets_, F, y);
fun_.getGradient(ds_.targets_, F, y, ds_.getWeights().get());
TreeRegressor regressor(ds_, y, fun_);

std::unique_ptr<TreeNode<uint16_t>> weakModel(
Expand Down Expand Up @@ -131,7 +132,8 @@ void Gbm::getModel(
// double score = weakModel->eval(fvec);
double score = ds_.getPrediction(weakModel.get(), i);
F[i] += score;
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
}
}

Expand Down
81 changes: 51 additions & 30 deletions GbmFun.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <boost/scoped_array.hpp>
#include <vector>
#include "glog/logging.h"

namespace boosting {

Expand All @@ -12,19 +13,23 @@ namespace boosting {
class GbmFun {
public:
virtual double getLeafVal(const std::vector<int>& subset,
const boost::scoped_array<double>& y) const = 0;
const boost::scoped_array<double>& y,
const std::vector<double>* wts = NULL) const = 0;

virtual double getF0(const std::vector<double>& y) const = 0;
virtual double getF0(const std::vector<double>& y,
const std::vector<double>* wts = NULL) const = 0;

virtual void getGradient(const std::vector<double>& y,
const boost::scoped_array<double>& F,
boost::scoped_array<double>& grad) const = 0;
boost::scoped_array<double>& grad,
const std::vector<double>* wts = NULL) const = 0;

virtual double getInitLoss(const std::vector<double>& y) const = 0;
virtual double getInitLoss(const std::vector<double>& y,
const std::vector<double>* wts = NULL) const = 0;

virtual double getExampleLoss(const double y, const double f) const = 0;
virtual double getExampleLoss(const double y, const double f, const double w) const = 0;

virtual void accumulateExampleLoss(const double y, const double f) = 0;
virtual void accumulateExampleLoss(const double y, const double f, const double w) = 0;

virtual double getReduction() const = 0;

Expand All @@ -36,30 +41,36 @@ class GbmFun {

class LeastSquareFun : public GbmFun {
public:
LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) {
LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0), sumw_(0.0) {
}

double getLeafVal(const std::vector<int>& subset,
const boost::scoped_array<double>& y) const {

double sum = 0;
const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
double sumwy = 0;
double sumw = 0;
for (const auto& id : subset) {
sum += y[id];
double w = ((wts != NULL) ? (*wts)[id] : 1.0);
sumw += w;
sumwy += w * y[id];
}
return sum/subset.size();
return sumwy/sumw;
}

double getF0(const std::vector<double>& yvec) const {
double sum = 0.0;
for (const auto& y : yvec) {
sum += y;
double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
double sumwy = 0;
double sumw = 0;
for (int i = 0; i < yvec.size(); i++) {
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
sumw += w;
sumwy += w * yvec[i];
}
return sum/yvec.size();
return sumwy/sumw;
}

void getGradient(const std::vector<double>& y,
const boost::scoped_array<double>& F,
boost::scoped_array<double>& grad) const {
boost::scoped_array<double>& grad,
const std::vector<double>* wts = NULL) const {

int size = y.size();

Expand All @@ -68,31 +79,40 @@ class LeastSquareFun : public GbmFun {
}
}

double getInitLoss(const std::vector<double>& yvec) const {
double getInitLoss(const std::vector<double>& yvec,
const std::vector<double>* wts = NULL) const {

double sumy = 0.0;
double sumy2 = 0.0;
double sumw = 0.0;

for (const auto& y : yvec) {
sumy += y;
sumy2 += y*y;
for (int i = 0; i < yvec.size(); i++) {
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
double y = yvec[i];

sumw += w;
sumy += w*y;
sumy2 += w*y*y;
}

return sumy2 - sumy * sumy/yvec.size();
return sumy2 - sumy * sumy/sumw;
}

double getExampleLoss(const double y, const double f) const {
return (y - f) * (y - f);
double getExampleLoss(const double y, const double f, const double w) const {
return w * (y - f) * (y - f);
}

void accumulateExampleLoss(const double y, const double f) {
sumy_ += y;
void accumulateExampleLoss(const double y, const double f, const double w) {
sumy_ += w * y;
numExamples_ += 1;
sumy2_ += y * y;
l2_ += getExampleLoss(y, f);
sumw_ += w;
sumy2_ += w * y * y;

l2_ += getExampleLoss(y, f, w);
}

double getReduction() const {
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
}

int getNumExamples() const {
Expand All @@ -108,6 +128,7 @@ class LeastSquareFun : public GbmFun {
double sumy_;
double sumy2_;
double l2_;
double sumw_;
};

}
Loading