src/IO/InputData.cpp

00001 /*
00002 * This file is part of MultiBoost, a multi-class 
00003 * AdaBoost learner/classifier
00004 *
00005 * Copyright (C) 2005 Norman Casagrande
00006 * For information, write to nova77@gmail.com
00007 *
00008 * This library is free software; you can redistribute it and/or
00009 * modify it under the terms of the GNU Lesser General Public
00010 * License as published by the Free Software Foundation; either
00011 * version 2.1 of the License, or (at your option) any later version.
00012 *
00013 * This library is distributed in the hope that it will be useful,
00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 * Lesser General Public License for more details.
00017 *
00018 * You should have received a copy of the GNU Lesser General Public
00019 * License along with this library; if not, write to the Free Software
00020 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00021 *
00022 */
00023 
00024 // Indexes: i = loop on examples
00025 //          j = loop on columns
00026 //          l = loop on classes
00027 
00028 #include <iostream> // for cerr
00029 #include <algorithm> // for sort
00030 #include <functional> // for less
00031 
00032 #include "Utils/Utils.h" // for white_tabs
00033 #include "IO/InputData.h"
00034 
00035 #include "Defaults.h" // for MB_DEBUG
00036 
00037 #include "IO/ClassMappings.h"
00038 
00039 namespace MultiBoost {
00040 
00041 // ------------------------------------------------------------------------
00042 
00050 InputData::~InputData()
00051 {
00052    for (int i = 0; i < _numExamples; ++i)
00053       delete [] _data[i].pValues;
00054 }
00055 
00056 void InputData::initOptions(nor_utils::Args& args)
00057 {
00059    // check if the input file has a filename for each example
00060    if ( args.hasArgument("-hasfilename") )
00061       _hasFileName = true;
00062 
00064    // check if the class is at the last column of the data file
00065    if ( args.hasArgument("-classend") )
00066       _classInLastColumn = true;
00067 }
00068 
00069 // ------------------------------------------------------------------------
00070 
00071 void InputData::load(const string& fileName, const eInputType inputType, const int verboseLevel)
00072 {
00073    if (verboseLevel > 0)
00074    {
00075       cout << "Loading file " << fileName << "...";
00076       cout.flush();
00077    }
00078 
00079    ifstream inFile(fileName.c_str());
00080    if ( !inFile.is_open() )
00081    {
00082       cerr << "\nERROR: Cannot open file <" << fileName << ">!!" << endl;
00083       exit(1);
00084    }
00085 
00086    // set white spaces to consider tab as NOT whitespace
00087    // the white_tab will be erased automatically by fstream
00088    inFile.imbue( locale(locale(), new nor_utils::white_tab) );
00089 
00090    _numColumns = (int)nor_utils::count_columns(inFile);
00091    inFile.clear(); // reset position
00092    inFile.seekg(0);
00093 
00094    // if it has a filename for each example, don't count it
00095    if (_hasFileName)
00096       --_numColumns;
00097 
00098    // the class is not a data column
00099    --_numColumns;
00100 
00101    // this array will be filled with the values from the example.
00102    // We need this to be sure we are not storing fake data because we reached
00103    // the end of the file
00104    double* pDataArray = NULL;
00105 
00106    string tmpFileName;
00107    string tmpClassName;
00108 
00109    int classIndex = 0;
00110    _numExamples = 0;
00111 
00112    map<int, int> tmpPointsPerClass;
00113 
00115    while( !inFile.eof() ) 
00116    {
00117       if (_hasFileName)
00118          inFile >> tmpFileName; // store file name
00119 
00120       if (!_classInLastColumn)
00121          inFile >> tmpClassName; // store class
00122 
00123       pDataArray = new double[_numColumns];
00124       if (!pDataArray)
00125       {
00126          cerr << "ERROR: Cannot allocate memory for storage!" << endl;
00127          exit(1);
00128       }
00129 
00130       for (int j = 0; j < _numColumns; ++j)
00131          inFile >> pDataArray[j]; // store values
00132 
00133       // to avoid problems in the case of an empty line at the end
00134       // of the file
00135       if ( inFile.eof() )
00136       {
00137          delete [] pDataArray;
00138          break;
00139       }
00140 
00141       if (_classInLastColumn)
00142          inFile >> tmpClassName; // store class
00143 
00144       int classIdx = ClassMappings::addClassName(tmpClassName);
00145       tmpPointsPerClass[ classIdx ]++;
00146 
00147       _data.push_back(Example(pDataArray, classIdx, tmpFileName));
00148       ++_numExamples;
00149    } 
00150 
00152 
00153    const int numClasses = ClassMappings::getNumClasses();
00154 
00155    for (int l = 0; l < numClasses; ++l)
00156       _nExamplesPerClass.push_back( tmpPointsPerClass[l] );
00157 
00158    // Initialize weights
00159    initWeights();
00160 
00161 #if MB_DEBUG
00162    // Checks if there is no variance on the columns
00163    checkVariances();
00164 #endif
00165 
00166    if (verboseLevel > 0)
00167    {
00168       cout << "Done!" << endl;
00169 
00170       if (verboseLevel > 1)
00171       {
00172          cout << "Num Columns = " << _numColumns << endl;  
00173 
00174          for (int l = 0; l < numClasses; ++l)
00175             cout << "Of class '" << ClassMappings::getClassNameFromIdx(l) << "': " 
00176                  << _nExamplesPerClass[l] << endl;
00177 
00178          cout << "Total: " << _numExamples << " examples read." << endl;
00179       }
00180    } 
00181 
00182 }
00183 
00184 // ------------------------------------------------------------------------
00185 
00186 // Initialize weights
00187 void InputData::initWeights()
00188 {
00189    const int numClasses = ClassMappings::getNumClasses();
00190 
00191    for (int i = 0; i < _numExamples; ++i)
00192    {
00193       _data[i].weights.resize(numClasses); // resize vector to the number of classes
00194 
00195       for (int l = 0; l < numClasses; ++l)
00196       {
00197          // basic formula for weight initialization: if the example [i] belongs to class [l]
00198          // then it's weights is 1 / (2*numExamples), 
00199          // otherwise it is 1 / (2 * numExamples * (numClasses-1) )
00200          if (l == _data[i].classIdx)
00201             _data[i].weights[l] = 1 / (2*(double)_numExamples);
00202          else
00203             _data[i].weights[l] = 1 / (2*(double)_numExamples*(double)(numClasses-1));
00204       }
00205    }
00206 }
00207 
00208 // ------------------------------------------------------------------------
00209 
#if MB_DEBUG
// Prints a warning on cerr for every column that holds the same value in all
// the examples, that is a column with no variance.
// Compiled only when MB_DEBUG is enabled.
void InputData::checkVariances()
{
   // without examples there is nothing to compare, and the value of the
   // first example (index 0) must not be read
   if (_numExamples < 1)
      return;

   // for each column
   for (int j = 0; j < _numColumns; ++j)
   {
      const double valueChk = getValue(0, j); // reference: value of the first example
      bool hasVariance = false;

      // look for the first example whose value differs from the reference
      for (int i = 1; i < _numExamples; ++i)
      {
         if ( getValue(i, j) != valueChk )
         {
            hasVariance = true; // the value has changed.. thus variance > 0
            break;
         }
      }

      if (!hasVariance)
         cerr << "WARNING!! Column " << j << " has no variance!" << endl;
   }
}
#endif // MB_DEBUG
00234 
00235 } // end of namespace MultiBoost

Generated on Mon Nov 28 21:43:46 2005 for MultiBoost by  doxygen 1.4.5