bayesian
diff --git a/‎Config.cpp
+5-1 b/‎Config.cpp
+5-1
diff --git a/‎Config.h
+8-2 b/‎Config.h
+8-2
diff --git a/‎DataSet.cpp
+15-2 b/‎DataSet.cpp
+15-2
diff --git a/‎DataSet.h
+9-3 b/‎DataSet.h
+9-3
diff --git a/‎Gbm.cpp
+7-5 b/‎Gbm.cpp
+7-5
diff --git a/‎GbmFun.h
+51-30 b/‎GbmFun.h
+51-30
@@ -44,13 +44,17 @@ bool Config::readConfig(const std::string& fileName) {
     cmpIdx_ = (it != cfg.items().end())
       ? columnIdx[it->second.asString()] : -1;
 
+    it = cfg.find("weight_column");  // optional; -1 means "no weight column"
+    weightIdx_ = (it != cfg.items().end())
+      ? columnIdx.at(it->second.asString()) : -1;  // at(): reject unknown name
+
     it = cfg.find("loss_function");
     if (it != cfg.items().end() && it->second.asString() == "logistic") {
       lossFunction_ = L2Logistic;
     } else {
       lossFunction_ = L2Regression;
     }
-    
+
     const dynamic& trainColumns = cfg["train_columns"];
     for (auto it = trainColumns.begin(); it != trainColumns.end(); ++it) {
       trainIdx_.push_back(columnIdx.at(it->asString()));
 
@@ -10,7 +10,7 @@ enum LossFunction {
   L2Regression = 0,
   L2Logistic   = 1
 };
- 
+
 // Specifying the training parameters and data format
 struct Config {
 
@@ -46,6 +46,10 @@ struct Config {
     return targetIdx_;
   }
 
+  int getWeightIdx() const {
+    return weightIdx_;
+  }
+
   int getCompareIdx() const {
     return cmpIdx_;
   }
@@ -93,8 +97,10 @@ struct Config {
 
   int targetIdx_;
   int cmpIdx_;
+  int weightIdx_;
+
   LossFunction lossFunction_;
-  
+
   std::vector<int> trainIdx_;
   std::vector<int> weakIdx_;
   std::vector<int> evalIdx_;
 
@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)
     features_[i].fvec.reset(new vector<double>());
     features_[i].encoding = DOUBLE;
   }
+
+  if (cfg_.getWeightIdx() != -1) {
+    weights_.reset(new vector<double>());
+  }
 }
 
 bool DataSet::getEvalColumns(const std::string& line,
@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,
 
 bool DataSet::getRow(const string& line, double* target,
                      boost::scoped_array<double>& fvec,
+                     double* weight,
                      double* cmpValue) const {
   try {
     vector<folly::StringPiece> sv;
@@ -64,7 +69,11 @@ bool DataSet::getRow(const string& line, double* target,
     if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {
       *cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());
     }
-
+    // weight is optional: never dereference a NULL out-parameter.
+    if (weight != NULL) {
+      *weight = (cfg_.getWeightIdx() != -1)
+        ? atof(sv[cfg_.getWeightIdx()].toString().c_str()) : 1.0;
+    }
   } catch (...) {
     LOG(ERROR) << "fail to process line: " << line;
     return false;
@@ -100,7 +109,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {
 }
 
 bool DataSet::addVector(const boost::scoped_array<double>& fvec,
-                        double target) {
+                        double target, double weight) {
   if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {
     return false;
   }
@@ -128,6 +137,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,
       }
     }
   }
+  if (weights_) {
+    weights_->push_back(weight);
+  }
+
   targets_.push_back(target);
   numExamples_++;
 
 
@@ -49,12 +49,14 @@ class DataSet {
  public:
   DataSet(const Config& cfg, int bucketingThresh, int examplesThresh=-1);
 
-  bool addVector(const boost::scoped_array<double>& fvec, double target);
+  bool addVector(const boost::scoped_array<double>& fvec,
+                 double target, double weight);
 
   bool getRow(const std::string& line,
               double* target,
               boost::scoped_array<double>& fvec,
-              double* cmpValue = NULL) const;
+              double* weight,
+              double* cmpValue) const;
 
   bool getEvalColumns(const std::string& line,
 		      boost::scoped_array<std::string>& feval) const;
@@ -63,6 +65,10 @@ class DataSet {
     return numExamples_;
   }
 
+  const std::unique_ptr<std::vector<double>>& getWeights() const {
+    return weights_;
+  }
+
   void getFeatureVec(const int eid, boost::scoped_array<uint16_t>& fvec) const {
     for (int i = 0; i < numFeatures_; i++) {
       if (features_[i].encoding == EMPTY) {
@@ -103,6 +109,7 @@ class DataSet {
 
   boost::scoped_array<FeatureData> features_;
   std::vector<double> targets_;
+  std::unique_ptr<std::vector<double>> weights_;
 
   friend class TreeRegressor;
   friend class Gbm;
@@ -126,4 +133,3 @@ template<class T> void split(const std::vector<int>& subset,
 }
 
 }
-
 
@@ -49,7 +49,8 @@ class ParallelEval : public apache::thrift::concurrency::Runnable {
         //double score = weakModel_->eval(fvec);
         double score = ds_.getPrediction(weakModel_.get(), i);
         F_[i] += score;
-        subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i]);
+        const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
+        subLoss_[workIdx_] += fun_.getExampleLoss(targets_[i], F_[i], wt);
       }
     }
     monitor_.decrement();
@@ -78,22 +79,22 @@ void Gbm::getModel(
   boost::scoped_array<double> F(new double[numExamples]);
   boost::scoped_array<double> y(new double[numExamples]);
 
-  double f0 = fun_.getF0(ds_.targets_);
+  double f0 = fun_.getF0(ds_.targets_, ds_.getWeights().get());
   for (int i = 0; i < numExamples; i++) {
     F[i] = f0;
   }
 
   model->push_back(new LeafNode<double>(f0));
 
-  double initLoss = fun_.getInitLoss(ds_.targets_);
+  double initLoss = fun_.getInitLoss(ds_.targets_, ds_.getWeights().get());
 
   LOG(INFO) << "init avg loss " << initLoss / numExamples;
 
   for (int it = 0; it < cfg_.getNumTrees(); it++) {
 
     LOG(INFO) << "------- iteration " << it << " -------";
 
-    fun_.getGradient(ds_.targets_, F, y);
+    fun_.getGradient(ds_.targets_, F, y, ds_.getWeights().get());
     TreeRegressor regressor(ds_, y, fun_);
 
     std::unique_ptr<TreeNode<uint16_t>> weakModel(
@@ -131,7 +132,8 @@ void Gbm::getModel(
         // double score = weakModel->eval(fvec);
         double score = ds_.getPrediction(weakModel.get(), i);
         F[i] += score;
-        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i]);
+        const double wt = ds_.getWeights() ? (*(ds_.getWeights()))[i] : 1.0;
+        newLoss += fun_.getExampleLoss(ds_.targets_[i], F[i], wt);
       }
     }
 
 
@@ -2,6 +2,7 @@
 
 #include <boost/scoped_array.hpp>
 #include <vector>
+#include "glog/logging.h"
 
 namespace boosting {
 
@@ -12,19 +13,23 @@ namespace boosting {
 class GbmFun {
  public:
   virtual double getLeafVal(const std::vector<int>& subset,
-                            const boost::scoped_array<double>& y) const = 0;
+                            const boost::scoped_array<double>& y,
+                            const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getF0(const std::vector<double>& y) const = 0;
+  virtual double getF0(const std::vector<double>& y,
+                       const std::vector<double>* wts = NULL) const = 0;
 
   virtual void getGradient(const std::vector<double>& y,
                            const boost::scoped_array<double>& F,
-                           boost::scoped_array<double>& grad) const = 0;
+                           boost::scoped_array<double>& grad,
+                           const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getInitLoss(const std::vector<double>& y) const = 0;
+  virtual double getInitLoss(const std::vector<double>& y,
+                             const std::vector<double>* wts = NULL) const = 0;
 
-  virtual double getExampleLoss(const double y, const double f) const = 0;
+  virtual double getExampleLoss(const double y, const double f, const double w) const = 0;
 
-  virtual void accumulateExampleLoss(const double y, const double f) = 0;
+  virtual void accumulateExampleLoss(const double y, const double f, const double w) = 0;
 
   virtual double getReduction() const = 0;
 
@@ -36,30 +41,36 @@ class GbmFun {
 
 class LeastSquareFun : public GbmFun {
  public:
-  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0) {
+  LeastSquareFun() : numExamples_(0), sumy_(0.0), sumy2_(0.0), l2_(0.0), sumw_(0.0) {
   }
 
   double getLeafVal(const std::vector<int>& subset,
-                    const boost::scoped_array<double>& y) const {
-
-    double sum = 0;
+                    const boost::scoped_array<double>& y, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;
+    double sumw = 0;
     for (const auto& id : subset) {
-      sum += y[id];
+      double w = ((wts != NULL) ? (*wts)[id] : 1.0);
+      sumw += w;
+      sumwy += w * y[id];
     }
-    return sum/subset.size();
+    return sumwy/sumw;
   }
 
-  double getF0(const std::vector<double>& yvec) const {
-    double sum = 0.0;
-    for (const auto& y : yvec) {
-      sum += y;
+  double getF0(const std::vector<double>& yvec, const std::vector<double>* wts = NULL) const {
+    double sumwy = 0;  // sum of w * y
+    double sumw = 0;   // sum of w (unit weights when wts is NULL)
+    for (size_t i = 0; i < yvec.size(); i++) {  // size_t matches size()
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      sumw += w;
+      sumwy += w * yvec[i];
     }
-    return sum/yvec.size();
+    return sumwy/sumw;
   }
 
   void getGradient(const std::vector<double>& y,
                    const boost::scoped_array<double>& F,
-                   boost::scoped_array<double>& grad) const {
+                   boost::scoped_array<double>& grad,
+                   const std::vector<double>* wts = NULL) const {
 
     int size = y.size();
 
@@ -68,31 +79,40 @@ class LeastSquareFun : public GbmFun {
     }
   }
 
-  double getInitLoss(const std::vector<double>& yvec) const {
+  double getInitLoss(const std::vector<double>& yvec,
+                     const std::vector<double>* wts = NULL) const {
+
     double sumy = 0.0;
     double sumy2 = 0.0;
+    double sumw = 0.0;
 
-    for (const auto& y : yvec) {
-      sumy += y;
-      sumy2 += y*y;
+    for (size_t i = 0; i < yvec.size(); i++) {  // size_t matches size()
+      double w = ((wts != NULL) ? (*wts)[i] : 1.0);
+      double y = yvec[i];
+
+      sumw += w;
+      sumy += w*y;
+      sumy2 += w*y*y;
     }
 
-    return sumy2 - sumy * sumy/yvec.size();
+    return sumy2 - sumy * sumy/sumw;
   }
 
-  double getExampleLoss(const double y, const double f) const {
-    return (y - f) * (y - f);
+  double getExampleLoss(const double y, const double f, const double w) const {
+    return w * (y - f) * (y - f);
   }
 
-  void accumulateExampleLoss(const double y, const double f) {
-    sumy_ += y;
+  void accumulateExampleLoss(const double y, const double f, const double w) {
+    sumy_ += w * y;
     numExamples_ += 1;
-    sumy2_ += y * y;
-    l2_ += getExampleLoss(y, f);
+    sumw_ += w;
+    sumy2_ += w * y * y;
+
+    l2_ += getExampleLoss(y, f, w);
   }
 
   double getReduction() const {
-    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/numExamples_);
+    return 1.0 - l2_/(sumy2_ - sumy_ * sumy_/sumw_);
   }
 
   int getNumExamples() const {
@@ -108,6 +128,7 @@ class LeastSquareFun : public GbmFun {
   double sumy_;
   double sumy2_;
   double l2_;
+  double sumw_;
 };
 
 }
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,10 @@ DataSet::DataSet(const Config& cfg, int bucketingThresh, int examplesThresh)`
`25`	`25`	`features_[i].fvec.reset(new vector<double>());`
`26`	`26`	`features_[i].encoding = DOUBLE;`
`27`	`27`	`}`
	`28`	`+`
	`29`	`+ if (cfg_.getWeightIdx() != -1) {`
	`30`	`+ weights_.reset(new vector<double>());`
	`31`	`+ }`
`28`	`32`	`}`
`29`	`33`
`30`	`34`	`bool DataSet::getEvalColumns(const std::string& line,`
`@@ -41,6 +45,7 @@ bool DataSet::getEvalColumns(const std::string& line,`
`41`	`45`
`42`	`46`	`bool DataSet::getRow(const string& line, double* target,`
`43`	`47`	`boost::scoped_array<double>& fvec,`
	`48`	`+ double* weight,`
`44`	`49`	`double* cmpValue) const {`
`45`	`50`	`try {`
`46`	`51`	`vector<folly::StringPiece> sv;`
`@@ -64,7 +69,11 @@ bool DataSet::getRow(const string& line, double* target,`
`64`	`69`	`if (cfg_.getCompareIdx() != -1 && cmpValue != NULL) {`
`65`	`70`	`*cmpValue = atof(sv[cfg_.getCompareIdx()].toString().c_str());`
`66`	`71`	`}`
`67`		`-`
	`72`	`+ // weight is optional: never dereference a NULL out-parameter.`
	`73`	`+ if (weight != NULL) {`
	`74`	`+ *weight = (cfg_.getWeightIdx() != -1)`
	`75`	`+ ? atof(sv[cfg_.getWeightIdx()].toString().c_str()) : 1.0;`
	`76`	`+ }`
`68`	`77`	`} catch (...) {`
`69`	`78`	`LOG(ERROR) << "fail to process line: " << line;`
`70`	`79`	`return false;`
`@@ -100,7 +109,7 @@ double DataSet::getPrediction(TreeNode<uint16_t>* rt, int eid) const {`
`100`	`109`	`}`
`101`	`110`
`102`	`111`	`bool DataSet::addVector(const boost::scoped_array<double>& fvec,`
`103`		`- double target) {`
	`112`	`+ double target, double weight) {`
`104`	`113`	`if (examplesThresh_ != -1 && numExamples_ > examplesThresh_) {`
`105`	`114`	`return false;`
`106`	`115`	`}`
`@@ -128,6 +137,10 @@ bool DataSet::addVector(const boost::scoped_array<double>& fvec,`
`128`	`137`	`}`
`129`	`138`	`}`
`130`	`139`	`}`
	`140`	`+ if (weights_) {`
	`141`	`+ weights_->push_back(weight);`
	`142`	`+ }`
	`143`	`+`
`131`	`144`	`targets_.push_back(target);`
`132`	`145`	`numExamples_++;`
`133`	`146`