Skip to content

Commit 2ee2a33

Browse files
author
root
committed
support weighted examples
fix the loss reduction from weighted ranking
1 parent 834c9b1 commit 2ee2a33

10 files changed

+187
-103
lines changed

Config.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
4444
cmpIdx_ = (it != cfg.items().end())
4545
? columnIdx[it->second.asString()] : -1;
4646

47+
it = cfg.find("weight_column");
48+
weightIdx_ = (it != cfg.items().end())
49+
? columnIdx[it->second.asString()] : -1;
50+
4751
it = cfg.find("loss_function");
4852
if (it != cfg.items().end() && it->second.asString() == "logistic") {
4953
lossFunction_ = L2Logistic;
5054
} else {
5155
lossFunction_ = L2Regression;
5256
}
53-
57+
5458
const dynamic& trainColumns = cfg["train_columns"];
5559
for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
5660
trainIdx_.push_back(columnIdx.at(it->asString()));

Config.h

+8-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ enum LossFunction {
1010
L2Regression = 0,
1111
L2Logistic = 1
1212
};
13-
13+
1414
// Specifying the training parameters and data format
1515
struct Config {
1616

@@ -46,6 +46,10 @@ struct Config {
4646
return targetIdx_;
4747
}
4848

49+
int getWeightIdx() const {
50+
return weightIdx_;
51+
}
52+
4953
int getCompareIdx() const {
5054
return cmpIdx_;
5155
}
@@ -93,8 +97,10 @@ struct Config {
9397

9498
int targetIdx_;
9599
int cmpIdx_;
100+
int weightIdx_;
101+
96102
LossFunction lossFunction_;
97-
103+
98104
std::vector<int> trainIdx_;
99105
std::vector<int> weakIdx_;
100106
std::vector<int> evalIdx_;

DataSet.cpp

+15-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
2525
features_[i].fvec.reset(new vector<double>());
2626
features_[i].encoding = DOUBLE;
2727
}
28+
29+
if (cfg_.getWeightIdx() != -1) {
30+
weights_.reset(new vector<double>());
31+
}
2832
}
2933

3034
bool DataSet::getEvalColumns(const std::string& line,
@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,
4145

4246
bool DataSet::getRow(const string& line, double* target,
4347
boost::scoped_array<double>& fvec,
48+
double* weight,
4449
double* cmpValue) const {
4550
try {
4651
vector<folly::StringPiece> sv;
@@ -64,7 +69,11 @@ bool DataSet::getRow(const string& line, double* target,
6469
if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
6570
*cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
6671
}
67-
72+
// Report the example weight only when the caller supplied an output slot;
// never write through a NULL `weight`. Without a configured weight column
// every example gets the neutral weight 1.0.
if (weight != NULL) {
  *weight = (cfg_.getWeightIdx() != -1)
      ? atof(sv[cfg_.getWeightIdx()].toString().c_str())
      : 1.0;
}
6877
} catch (...) {
6978
LOG(ERROR) << "fail to process line: " << line;
7079
return false;
@@ -100,7 +109,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {
100109
}
101110

102111
bool DataSet::addVector(const boost::scoped_array<double>& fvec,
103-
double target) {
112+
double target, double weight) {
104113
if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
105114
return false;
106115
}
@@ -128,6 +137,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
128137
}
129138
}
130139
}
140+
if (weights_) {
141+
weights_->push_back(weight);
142+
}
143+
131144
targets_.push_back(target);
132145
numExamples_++;
133146

DataSet.h

+9-3
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,14 @@ class DataSet {
4949
public:
5050
DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);
5151

52-
bool addVector(const boost::scoped_array<double>& fvec, double target);
52+
bool addVector(const boost::scoped_array<double>& fvec,
53+
double target, double weight);
5354

5455
bool getRow(const std::string& line,
5556
double* target,
5657
boost::scoped_array<double>& fvec,
57-
double* cmpValue = NULL) const;
58+
double* weight,
59+
double* cmpValue) const;
5860

5961
bool getEvalColumns(const std::string& line,
6062
boost::scoped_array<std::string>& feval) const;
@@ -63,6 +65,10 @@ class DataSet {
6365
return numExamples_;
6466
}
6567

68+
const std::unique_ptr<std::vector<double>>& getWeights() const {
69+
return weights_;
70+
}
71+
6672
void getFeatureVec(const int eid, boost::scoped_array<uint16_t>& fvec) const {
6773
for (int i = 0; i < numFeatures_; i++) {
6874
if (features_[i].encoding == EMPTY) {
@@ -103,6 +109,7 @@ class DataSet {
103109

104110
boost::scoped_array<FeatureData> features_;
105111
std::vector<double> targets_;
112+
std::unique_ptr<std::vector<double>> weights_;
106113

107114
friend class TreeRegressor;
108115
friend class Gbm;
@@ -126,4 +133,3 @@ template<class T> void split(const std::vector<int>& subset,
126133
}
127134

128135
}
129-

Gbm.cpp

+7-5
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
4949
//double score = weakModel_->eval(fvec);
5050
double score = ds_.getPrediction(weakModel_.get(), i);
5151
F_[i] += score;
52-
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
52+
const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
53+
subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
5354
}
5455
}
5556
monitor_.decrement();
@@ -78,22 +79,22 @@ void Gbm::getModel(
7879
boost::scoped_array<double> F(new double[numExamples]);
7980
boost::scoped_array<double> y(new double[numExamples]);
8081

81-
double f0 = fun_.getF0(ds_.targets_);
82+
double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
8283
for (int i = 0; i < numExamples; i++) {
8384
F[i] = f0;
8485
}
8586

8687
model->push_back(new LeafNode<double>(f0));
8788

88-
double initLoss = fun_.getInitLoss(ds_.targets_);
89+
double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());
8990

9091
LOG(INFO) << "init avg loss " << initLoss / numExamples;
9192

9293
for (int it = 0; it < cfg_.getNumTrees(); it++) {
9394

9495
LOG(INFO) << "------- iteration " << it << " -------";
9596

96-
fun_.getGradient(ds_.targets_, F, y);
97+
fun_.getGradient(ds_.targets_, F, y, ds_.getWeights().get());
9798
TreeRegressor regressor(ds_, y, fun_);
9899

99100
std::unique_ptr<TreeNode<uint16_t>> weakModel(
@@ -131,7 +132,8 @@ void Gbm::getModel(
131132
// double score = weakModel->eval(fvec);
132133
double score = ds_.getPrediction(weakModel.get(), i);
133134
F[i] += score;
134-
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
135+
const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
136+
newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
135137
}
136138
}
137139

GbmFun.h

+51-30
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <boost/scoped_array.hpp>
44
#include <vector>
5+
#include "glog/logging.h"
56

67
namespace boosting {
78

@@ -12,19 +13,23 @@ namespace boosting {
1213
class GbmFun {
1314
public:
1415
virtual double getLeafVal(const std::vector<int>& subset,
15-
const boost::scoped_array<double>& y) const = 0;
16+
const boost::scoped_array<double>& y,
17+
const std::vector<double>* wts = NULL) const = 0;
1618

17-
virtual double getF0(const std::vector<double>& y) const = 0;
19+
virtual double getF0(const std::vector<double>& y,
20+
const std::vector<double>* wts = NULL) const = 0;
1821

1922
virtual void getGradient(const std::vector<double>& y,
2023
const boost::scoped_array<double>& F,
21-
boost::scoped_array<double>& grad) const = 0;
24+
boost::scoped_array<double>& grad,
25+
const std::vector<double>* wts = NULL) const = 0;
2226

23-
virtual double getInitLoss(const std::vector<double>& y) const = 0;
27+
virtual double getInitLoss(const std::vector<double>& y,
28+
const std::vector<double>* wts = NULL) const = 0;
2429

25-
virtual double getExampleLoss(const double y, const double f) const = 0;
30+
virtual double getExampleLoss(const double y, const double f, const double w) const = 0;
2631

27-
virtual void accumulateExampleLoss(const double y, const double f) = 0;
32+
virtual void accumulateExampleLoss(const double y, const double f, const double w) = 0;
2833

2934
virtual double getReduction() const = 0;
3035

@@ -36,30 +41,36 @@ class GbmFun {
3641

3742
class LeastSquareFun : public GbmFun {
3843
public:
39-
// Start with empty accumulators: no examples seen, all weighted sums zero.
LeastSquareFun()
    : numExamples_(0),
      sumy_(0.0),
      sumy2_(0.0),
      l2_(0.0),
      sumw_(0.0) {}
4146

4247
double getLeafVal(const std::vector<int>& subset,
43-
const boost::scoped_array<double>& y) const {
44-
45-
double sum = 0;
48+
const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
49+
double sumwy = 0;
50+
double sumw = 0;
4651
for (const auto& id : subset) {
47-
sum += y[id];
52+
double w = ((wts != NULL) ? (*wts)[id] : 1.0);
53+
sumw += w;
54+
sumwy += w * y[id];
4855
}
49-
return sum/subset.size();
56+
return sumwy/sumw;
5057
}
5158

52-
double getF0(const std::vector<double>& yvec) const {
53-
double sum = 0.0;
54-
for (const auto& y : yvec) {
55-
sum += y;
59+
double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
60+
double sumwy = 0;
61+
double sumw = 0;
62+
for (int i = 0; i < yvec.size(); i++) {
63+
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
64+
sumw += w;
65+
sumwy += w * yvec[i];
5666
}
57-
return sum/yvec.size();
67+
return sumwy/sumw;
5868
}
5969

6070
void getGradient(const std::vector<double>& y,
6171
const boost::scoped_array<double>& F,
62-
boost::scoped_array<double>& grad) const {
72+
boost::scoped_array<double>& grad,
73+
const std::vector<double>* wts = NULL) const {
6374

6475
int size = y.size();
6576

@@ -68,31 +79,40 @@ class LeastSquareFun : public GbmFun {
6879
}
6980
}
7081

71-
double getInitLoss(const std::vector<double>& yvec) const {
82+
double getInitLoss(const std::vector<double>& yvec,
83+
const std::vector<double>* wts = NULL) const {
84+
7285
double sumy = 0.0;
7386
double sumy2 = 0.0;
87+
double sumw = 0.0;
7488

75-
for (const auto& y : yvec) {
76-
sumy += y;
77-
sumy2 += y*y;
89+
for (int i = 0; i < yvec.size(); i++) {
90+
double w = ((wts != NULL) ? (*wts)[i] : 1.0);
91+
double y = yvec[i];
92+
93+
sumw += w;
94+
sumy += w*y;
95+
sumy2 += w*y*y;
7896
}
7997

80-
return sumy2 - sumy * sumy/yvec.size();
98+
return sumy2 - sumy * sumy/sumw;
8199
}
82100

83-
double getExampleLoss(const double y, const double f) const {
84-
return (y - f) * (y - f);
101+
double getExampleLoss(const double y, const double f, const double w) const {
102+
return w * (y - f) * (y - f);
85103
}
86104

87-
void accumulateExampleLoss(const double y, const double f) {
88-
sumy_ += y;
105+
void accumulateExampleLoss(const double y, const double f, const double w) {
106+
sumy_ += w * y;
89107
numExamples_ += 1;
90-
sumy2_ += y * y;
91-
l2_ += getExampleLoss(y, f);
108+
sumw_ += w;
109+
sumy2_ += w * y * y;
110+
111+
l2_ += getExampleLoss(y, f, w);
92112
}
93113

94114
double getReduction() const {
95-
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
115+
return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
96116
}
97117

98118
int getNumExamples() const {
@@ -108,6 +128,7 @@ class LeastSquareFun : public GbmFun {
108128
double sumy_;
109129
double sumy2_;
110130
double l2_;
131+
double sumw_;
111132
};
112133

113134
}

0 commit comments

Comments
 (0)