Introduction to Random Forest Training

This section demonstrates how to use Shark's random forest.
The core idea of a random forest is to train many weak learners (here, decision trees) and then combine them with a majority vote to produce the final prediction, as sketched below.
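
A minimal sketch of the majority-vote step, written in plain C++ rather than the Shark API (the function name majorityVote is hypothetical): each trained tree casts one vote for a class label, and the label that collects the most votes becomes the forest's prediction.

#include <vector>
#include <map>
#include <cstddef>

//Returns the class label that received the most votes (0 if votes is empty).
unsigned int majorityVote(std::vector<unsigned int> const &votes)
{
    std::map<unsigned int, std::size_t> counts;
    for (unsigned int v : votes)
        ++counts[v];                 //tally one vote per decision tree
    unsigned int winner = 0;
    std::size_t best = 0;
    for (auto const &entry : counts)
    {
        if (entry.second > best)     //keep the label with the highest count
        {
            winner = entry.first;
            best = entry.second;
        }
    }
    return winner;
}

The full Shark example follows.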

#include <iostream>
#include <shark/Data/Csv.h>
#include <shark/Algorithms/Trainers/RFTrainer.h>
#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h>
using namespace shark;
using std::cout;
using std::endl;

int main(int argc, const char * argv[])
{
    try
    {
        //1. Load the data
        ClassificationDataset dataTrain;
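        //C.csv is expected to contain space-separated values with the class label in the last column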
        importCSV(dataTrain, "C.csv", LAST_COLUMN, ' ');
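        //splitAtElement keeps the first 311 samples in dataTrain and moves the remaining samples into the returned test set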
        ClassificationDataset dataTest = splitAtElement(dataTrain,311);
        //2. Train
        RFTrainer<unsigned int> trainer;        //the trainer has default parameters, which can also be set manually
        //the most important setters are setMTry (how many features are randomly considered for each split), setNTrees (how many decision trees to grow), setNodeSize (the node size at which splitting stops; larger values give shallower trees) and setOOBratio (the fraction of out-of-bag samples)
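        //A purely illustrative way to adjust them manually (the values below are
        //arbitrary assumptions, not tuned recommendations; uncomment to use):
        //trainer.setNTrees(100);   //grow 100 trees
        //trainer.setMTry(5);       //consider 5 randomly chosen features per split
        //trainer.setNodeSize(16);  //larger values give shallower trees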
        RFClassifier<unsigned int> model;
        trainer.train(model, dataTrain);
        //3. Evaluate
        ZeroOneLoss<> loss;
        auto prediction = model(dataTrain.inputs());
        cout << "Random Forest on training set accuracy: " << 1. - loss.eval(dataTrain.labels(), prediction) << endl;
        prediction = model(dataTest.inputs());
        cout << "Random Forest on test set accuracy:     " << 1. - loss.eval(dataTest.labels(), prediction) << endl;
    }
    catch (const std::exception &e)
    {
        cout << e.what() << endl;
    }
    return 0;
}
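
How the example is compiled depends on how Shark was installed; assuming a standard installation with Boost available, a command roughly along these lines should work (rf_example.cpp is a hypothetical file name for the code above):

g++ -std=c++11 rf_example.cpp -o rf_example -lshark -lboost_serialization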