class cv::ml::TrainData

Overview

Class encapsulating training data. More…

#include <ml.hpp>

class TrainData
{
public:
    // methods

    static
    Ptr<TrainData>
    create(
        InputArray samples,
        int layout,
        InputArray responses,
        InputArray varIdx = noArray(),
        InputArray sampleIdx = noArray(),
        InputArray sampleWeights = noArray(),
        InputArray varType = noArray()
        );

    static
    Mat
    getSubVector(
        const Mat& vec,
        const Mat& idx
        );

    static
    Ptr<TrainData>
    loadFromCSV(
        const String& filename,
        int headerLineCount,
        int responseStartIdx = -1,
        int responseEndIdx = -1,
        const String& varTypeSpec = String(),
        char delimiter = ',',
        char missch = '?'
        );

    static
    float
    missingValue();

    virtual
    int
    getCatCount(int vi) const = 0;

    virtual
    Mat
    getCatMap() const = 0;

    virtual
    Mat
    getCatOfs() const = 0;

    virtual
    Mat
    getClassLabels() const = 0;

    virtual
    Mat
    getDefaultSubstValues() const = 0;

    virtual
    int
    getLayout() const = 0;

    virtual
    Mat
    getMissing() const = 0;

    virtual
    int
    getNAllVars() const = 0;

    void
    getNames(std::vector<String>& names) const;

    virtual
    Mat
    getNormCatResponses() const = 0;

    virtual
    void
    getNormCatValues(
        int vi,
        InputArray sidx,
        int* values
        ) const = 0;

    virtual
    int
    getNSamples() const = 0;

    virtual
    int
    getNTestSamples() const = 0;

    virtual
    int
    getNTrainSamples() const = 0;

    virtual
    int
    getNVars() const = 0;

    virtual
    Mat
    getResponses() const = 0;

    virtual
    int
    getResponseType() const = 0;

    virtual
    void
    getSample(
        InputArray varIdx,
        int sidx,
        float* buf
        ) const = 0;

    virtual
    Mat
    getSamples() const = 0;

    virtual
    Mat
    getSampleWeights() const = 0;

    virtual
    Mat
    getTestNormCatResponses() const = 0;

    virtual
    Mat
    getTestResponses() const = 0;

    virtual
    Mat
    getTestSampleIdx() const = 0;

    Mat
    getTestSamples() const;

    virtual
    Mat
    getTestSampleWeights() const = 0;

    virtual
    Mat
    getTrainNormCatResponses() const = 0;

    virtual
    Mat
    getTrainResponses() const = 0;

    virtual
    Mat
    getTrainSampleIdx() const = 0;

    virtual
    Mat
    getTrainSamples(
        int layout = ROW_SAMPLE,
        bool compressSamples = true,
        bool compressVars = true
        ) const = 0;

    virtual
    Mat
    getTrainSampleWeights() const = 0;

    virtual
    void
    getValues(
        int vi,
        InputArray sidx,
        float* values
        ) const = 0;

    virtual
    Mat
    getVarIdx() const = 0;

    Mat
    getVarSymbolFlags() const;

    virtual
    Mat
    getVarType() const = 0;

    virtual
    void
    setTrainTestSplit(
        int count,
        bool shuffle = true
        ) = 0;

    virtual
    void
    setTrainTestSplitRatio(
        double ratio,
        bool shuffle = true
        ) = 0;

    virtual
    void
    shuffleTrainTest() = 0;
};

Detailed Documentation

Class encapsulating training data.

Please note that the class only specifies the interface of training data, not the implementation. All the statistical model classes in the ml module accept Ptr<TrainData> as a parameter. In other words, you can create your own class derived from TrainData and pass a smart pointer to an instance of this class into StatModel::train.

See also:

Training Data

Methods

static
Ptr<TrainData>
create(
    InputArray samples,
    int layout,
    InputArray responses,
    InputArray varIdx = noArray(),
    InputArray sampleIdx = noArray(),
    InputArray sampleWeights = noArray(),
    InputArray varType = noArray()
    )

Creates training data from in-memory arrays.

Parameters:

samples matrix of samples. It should have CV_32F type.
layout see ml::SampleTypes.
responses matrix of responses. If the responses are scalar, they should be stored as a single row or as a single column. The matrix should have type CV_32F or CV_32S (in the former case the responses are considered as ordered by default; in the latter case - as categorical)
varIdx vector specifying which variables to use for training. It can be an integer vector (CV_32S) containing 0-based variable indices or byte vector (CV_8U) containing a mask of active variables.
sampleIdx vector specifying which samples to use for training. It can be an integer vector (CV_32S) containing 0-based sample indices or byte vector (CV_8U) containing a mask of training samples.
sampleWeights optional vector with weights for each sample. It should have CV_32F type.
varType optional vector of type CV_8U and size <number_of_variables_in_samples> + <number_of_variables_in_responses>, containing types of each input and output variable. See ml::VariableTypes.
static
Ptr<TrainData>
loadFromCSV(
    const String& filename,
    int headerLineCount,
    int responseStartIdx = -1,
    int responseEndIdx = -1,
    const String& varTypeSpec = String(),
    char delimiter = ',',
    char missch = '?'
    )

Reads the dataset from a .csv file and returns the ready-to-use training data.

If the dataset only contains input variables and no responses, use responseStartIdx = -2 and responseEndIdx = 0. The output variables vector will just contain zeros.

Parameters:

filename The input file name
headerLineCount The number of lines at the beginning to skip; besides the header, the function also skips empty lines and lines starting with #
responseStartIdx Index of the first output variable. If -1, the function considers the last variable as the response
responseEndIdx Index of the last output variable + 1. If -1, then there is a single response variable at responseStartIdx.
varTypeSpec

The optional text string that specifies the variables’ types. It has the format ord[n1-n2,n3,n4-n5,...]cat[n6,n7-n8,...]. That is, variables from n1 to n2 (inclusive range), n3, n4 to n5… are considered ordered and n6, n7 to n8… are considered as categorical. The range [n1..n2] + [n3] + [n4..n5] + ... + [n6] + [n7..n8] should cover all the variables. If varTypeSpec is not specified, then the algorithm uses the following rules:

  • all input variables are considered ordered by default. If some column contains non-numerical values, e.g. ‘apple’, ‘pear’, ‘apple’, ‘apple’, ‘mango’, the corresponding variable is considered categorical.
  • if there are several output variables, they are all considered as ordered. An error is reported when non-numerical values are used.
  • if there is a single output variable, then if its values are non-numerical or are all integers, then it’s considered categorical. Otherwise, it’s considered ordered.
delimiter The character used to separate values in each line.
missch The character used to specify missing measurements. It should not be a digit. Although it’s a non-numerical value, it surely does not affect the decision of whether the variable is ordered or categorical.
virtual
Mat
getClassLabels() const = 0

Returns the vector of class labels.

The function returns the vector of unique labels that occur in the responses.

void
getNames(std::vector<String>& names) const

Returns the vector of symbolic names captured in loadFromCSV().

Mat
getTestSamples() const

Returns matrix of test samples.

virtual
Mat
getTrainNormCatResponses() const = 0

Returns the vector of normalized categorical responses.

The function returns the vector of responses. Each response is an integer from 0 to <number of classes>-1. The actual label value can then be retrieved from the class label vector, see TrainData::getClassLabels.

virtual
Mat
getTrainResponses() const = 0

Returns the vector of responses.

The function returns ordered or the original categorical responses. Usually it’s used in regression algorithms.

virtual
Mat
getTrainSamples(
    int layout = ROW_SAMPLE,
    bool compressSamples = true,
    bool compressVars = true
    ) const = 0

Returns matrix of train samples.

In the current implementation the function tries to avoid physical data copying and returns the matrix stored inside TrainData (unless the transposition or compression is needed).

Parameters:

layout The requested layout. If it’s different from the initial one, the matrix is transposed. See ml::SampleTypes.
compressSamples if true, the function returns only the training samples (specified by sampleIdx)
compressVars if true, the function returns the shorter training samples, containing only the active variables.
virtual
void
setTrainTestSplit(
    int count,
    bool shuffle = true
    ) = 0

Splits the training data into the training and test parts.

See also:

TrainData::setTrainTestSplitRatio

virtual
void
setTrainTestSplitRatio(
    double ratio,
    bool shuffle = true
    ) = 0

Splits the training data into the training and test parts.

The function selects a subset of specified relative size and then returns it as the training set. If the function is not called, all the data is used for training. Please note that for each TrainData::getTrain* method there is a corresponding TrainData::getTest* method, so that the test subset can be retrieved and processed as well.

See also:

TrainData::setTrainTestSplit