MultiDataset.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  /**
  * @file MultiDataset.cpp
  * @brief Management of multiple named datasets (e.g., train and test) that are set up from config blocks
  * @author Erik Rodner
  * @date 02/08/2008
  */
  7. #include <iostream>
  8. #include <sys/stat.h>
  9. #include <sys/types.h>
  10. #include "vislearning/cbaselib/ClassNames.h"
  11. #include "core/basics/StringTools.h"
  12. #include "core/basics/FileMgt.h"
  13. #include "vislearning/cbaselib/MultiDataset.h"
  14. using namespace OBJREC;
  15. using namespace std;
  16. using namespace NICE;
  17. #undef DEBUG_MultiDataset
  18. void MultiDataset::selectExamples ( const std::string & examples_command,
  19. const LabeledSet & base,
  20. LabeledSet & positives,
  21. LabeledSet & negatives,
  22. const ClassNames & cn ) const
  23. {
  24. vector<string> examples;
  25. StringTools::split ( examples_command, ';', examples );
  26. set<int> processed_classes;
  27. for ( vector<string>::const_iterator i = examples.begin();
  28. i != examples.end();
  29. i++ )
  30. {
  31. const std::string & cmd = *i;
  32. vector<string> parts;
  33. StringTools::split ( cmd, ' ', parts );
  34. if ( (parts.size() != 3) && ((parts.size() != 2) || (parts[0] != "all")) )
  35. fthrow( Exception, "Syntax error " << examples_command );
  36. const std::string & mode = parts[0];
  37. const std::string & csel = parts[1];
  38. double parameter = (parts.size() == 3 ) ? atof(parts[2].c_str()) : 0.0;
  39. map<int, int> fpe;
  40. set<int> selection;
  41. cn.getSelection ( csel, selection );
  42. for ( set<int>::const_iterator j = selection.begin();
  43. j != selection.end();
  44. j++ )
  45. {
  46. int classno = *j;
  47. if ( processed_classes.find(classno) == processed_classes.end() )
  48. {
  49. #ifdef DEBUG_MultiDataset
  50. fprintf (stderr, "class %s: %s %d\n", cn.text(classno).c_str(),
  51. mode.c_str(), (int)parameter );
  52. #endif
  53. fpe[*j] = (int)parameter;
  54. processed_classes.insert(classno);
  55. } else {
  56. if ( csel != "*" ) {
  57. fthrow ( Exception, "Example selection method for class %s has multiple specifications" << cn.text(classno) );
  58. }
  59. }
  60. }
  61. if ( mode == "seq" ) {
  62. LabeledSetSelection<LabeledSet>::selectSequential (
  63. fpe, base, positives, negatives );
  64. #ifdef DEBUG_MultiDataset
  65. fprintf (stderr, "MultiDataset: after special seq selection: %d\n", positives.count() );
  66. #endif
  67. } else if ( mode == "step" ) {
  68. LabeledSetSelection<LabeledSet>::selectSequentialStep (
  69. fpe, base, positives, negatives );
  70. #ifdef DEBUG_MultiDataset
  71. fprintf (stderr, "MultiDataset: after special step selection: %d\n", positives.count() );
  72. #endif
  73. } else if ( mode == "random" ) {
  74. LabeledSetSelection<LabeledSet>::selectRandom (
  75. fpe, base, positives, negatives );
  76. #ifdef DEBUG_MultiDataset
  77. fprintf (stderr, "MultiDataset: after special random selection: %d\n", positives.count() );
  78. #endif
  79. } else if ( mode == "all" ) {
  80. if ( (int)selection.size() == cn.numClasses() )
  81. {
  82. // preserve permutation
  83. LabeledSet::Permutation permutation;
  84. base.getPermutation ( permutation );
  85. for ( LabeledSet::Permutation::iterator i = permutation.begin(); i != permutation.end(); i++ )
  86. {
  87. int classno = i->first;
  88. ImageInfo *element = const_cast< ImageInfo * > ( i->second );
  89. positives.add_reference ( classno, element );
  90. }
  91. } else {
  92. LabeledSetSelection<LabeledSet>::selectClasses ( selection, base, positives, negatives );
  93. }
  94. #ifdef DEBUG_MultiDataset
  95. fprintf (stderr, "MultiDataset: after special class selection: %d\n", positives.count() );
  96. #endif
  97. } else {
  98. fthrow ( Exception, "Wrong value for parameter example\n");
  99. }
  100. }
  101. #ifdef DEBUG_MultiDataset
  102. fprintf (stderr, "MultiDataset: after special selection operations: %d\n", positives.count() );
  103. #endif
  104. set<int> allclasses;
  105. cn.getSelection ( "*", allclasses );
  106. set<int> allnegative_classes;
  107. // add all examples from allclasses \setminus processed_classes
  108. set_difference(allclasses.begin(), allclasses.end(), processed_classes.begin(), processed_classes.end(),
  109. inserter(allnegative_classes, allnegative_classes.end()));
  110. LabeledSet dummy;
  111. LabeledSetSelection<LabeledSet>::selectClasses ( allnegative_classes,
  112. base, negatives, dummy );
  113. }
  114. /** MultiDataset ------- constructor */
  115. MultiDataset::MultiDataset( const Config *conf , LabeledSetFactory *pSetFactory)
  116. {
  117. //read all blocks from our config file
  118. std::set<string> blocks;
  119. conf->getAllBlocks ( blocks );
  120. #ifdef DEBUG_MultiDataset
  121. std::cerr << "found the following config blocks: " << std::endl;
  122. for ( std::set<string>::const_iterator blockIt = blocks.begin(); blockIt != blocks.end(); blockIt++)
  123. {
  124. std::cerr << *blockIt << " ";
  125. }
  126. std::cerr << std::endl;
  127. #endif
  128. lfl.setFactory( pSetFactory );
  129. //for every dataset (e.g., train and test), we store a single confog file
  130. map<string, Config> dsconfs;
  131. //for every dataset (e.g., train and test), we store the position of the file directory
  132. map<string, string> dirs;
  133. //first of all, remove all blocks which do correspond to specified datasets, i.e., that do not contain a "dataset" entry
  134. for ( set<string>::iterator i = blocks.begin();
  135. i != blocks.end(); )
  136. {
  137. if ( conf->gB(*i, "disable", false) )
  138. {
  139. i++;
  140. continue;
  141. }
  142. std::string dataset = conf->gS( *i, "dataset", "unknown" );
  143. if ( dataset == "unknown" )
  144. blocks.erase(i++);
  145. else {
  146. #ifdef DEBUG_MultiDataset
  147. fprintf (stderr, "Reading dataset config for block [%s]\n", i->c_str() );
  148. #endif
  149. Config dsconf ( (dataset + "/dataset.conf").c_str() );
  150. dirs[*i] = dataset;
  151. dsconfs[*i] = dsconf;
  152. i++;
  153. }
  154. }
  155. #ifdef DEBUG_MultiDataset
  156. std::cerr << "found the following datasets within all config blocks: " << std::endl;
  157. for ( std::set<string>::const_iterator blockIt = blocks.begin(); blockIt != blocks.end(); blockIt++)
  158. {
  159. std::cerr << *blockIt << " ";
  160. }
  161. std::cerr << std::endl;
  162. #endif
  163. //is there a dataset specified that contains images for both, training and testing?
  164. if ( blocks.find("traintest") != blocks.end() )
  165. {
  166. LabeledSet ls_base;
  167. LabeledSet ls_train (true);
  168. LabeledSet ls_nontrain (true);
  169. LabeledSet ls_test (true);
  170. LabeledSet dummy (true);
  171. LabeledSet temp (true);
  172. bool localizationInfoDisabled = conf->gB("traintest", "disable_localization_info", false );
  173. std::string classselection_train = conf->gS("traintest", "classselection_train", "*");
  174. std::string classselection_test = conf->gS("traintest", "classselection_test", "*");
  175. classnames["traintest"] = ClassNames();
  176. std::string classNamesTxt = dirs["traintest"] + "/classnames.txt";
  177. if ( FileMgt::fileExists ( classNamesTxt ) )
  178. {
  179. classnames["traintest"].read ( classNamesTxt );
  180. } else {
  181. classnames["traintest"].readFromConfig ( dsconfs["traintest"], classselection_train );
  182. }
  183. lfl.get ( dirs["traintest"], dsconfs["traintest"], classnames["traintest"], ls_base,
  184. localizationInfoDisabled, conf->gB("traintest", "debug_dataset", false ) );
  185. std::string examples_train = conf->gS("traintest", "examples_train" );
  186. selectExamples ( examples_train, ls_base, ls_train, ls_nontrain, classnames["traintest"] );
  187. set<int> selection_test;
  188. classnames["traintest"].getSelection ( classselection_test, selection_test );
  189. std::string examples_test = conf->gS("traintest", "examples_test" );
  190. if ( examples_test == "reclassification" )
  191. {
  192. LabeledSetSelection<LabeledSet>::selectClasses
  193. ( selection_test, ls_train, ls_test, dummy );
  194. } else {
  195. selectExamples ( examples_test, ls_nontrain, temp, dummy, classnames["traintest"] );
  196. LabeledSetSelection<LabeledSet>::selectClasses
  197. ( selection_test, temp, ls_test, dummy );
  198. }
  199. classnames["train"] = classnames["traintest"];
  200. classnames["test"] = ClassNames ( classnames["traintest"], classselection_test );
  201. datasets["test"] = ls_test;
  202. datasets["train"] = ls_train;
  203. }
  204. //now read files for every specified dataset (e.g., train and test)
  205. for ( set<string>::const_iterator i = blocks.begin();
  206. i != blocks.end();
  207. i++ )
  208. {
  209. std::string name = *i;
  210. std::cerr << "read: " << name << std::endl;
  211. if ( classnames.find(name) != classnames.end() )
  212. continue;
  213. if ( conf->gB(name, "disable", false) == true )
  214. continue;
  215. if ( dsconfs.find(name) == dsconfs.end() )
  216. continue;
  217. LabeledSet ls_base;
  218. LabeledSet ls (true);
  219. LabeledSet dummy (true);
  220. LabeledSet temp (true);
  221. bool localizationInfoDisabled = conf->gB(name, "disable_localization_info", false );
  222. std::string classselection = conf->gS(name, "classselection", "*");
  223. classnames[name] = ClassNames();
  224. std::string classNamesTxt = dirs[name] + "/classnames.txt";
  225. if ( FileMgt::fileExists ( classNamesTxt ) )
  226. {
  227. #ifdef DEBUG_MultiDataset
  228. fprintf (stderr, "MultiDataset: reading class names from %s\n", classNamesTxt.c_str() );
  229. #endif
  230. classnames[name].read ( classNamesTxt );
  231. } else {
  232. #ifdef DEBUG_MultiDataset
  233. fprintf (stderr, "MultiDataset: reading class names from dataset config file\n" );
  234. #endif
  235. classnames[name].readFromConfig ( dsconfs[name], classselection );
  236. }
  237. #ifdef DEBUG_MultiDataset
  238. std::cerr << "we set up everything to read this dataset - so now call lfl.get" << std::endl;
  239. #endif
  240. lfl.get ( dirs[name],
  241. dsconfs[name],
  242. classnames[name],
  243. ls_base,
  244. localizationInfoDisabled,
  245. conf->gB(name, "debug_dataset", false ) );
  246. #ifdef DEBUG_MultiDataset
  247. fprintf (stderr, "MultiDataset: class names -->\n" );
  248. classnames[name].store ( cerr );
  249. fprintf (stderr, "MultiDataset: all information about %s set obtained ! (size %d)\n", name.c_str(), ls_base.count() );
  250. #endif
  251. #ifdef DEBUG_MultiDataset
  252. std::cerr << "we now call selectExamples to pick only a subset if desired" << std::endl;
  253. #endif
  254. std::string examples = conf->gS(name, "examples", "all *" );
  255. selectExamples ( examples, ls_base, ls, dummy, classnames[name] );
  256. #ifdef DEBUG_MultiDataset
  257. fprintf (stderr, "MultiDataset: size after selection %d\n", ls.count() );
  258. #endif
  259. datasets[name] = ls;
  260. }
  261. bool dumpSelections = conf->gB("datasets", "dump_selection", false);
  262. if ( dumpSelections )
  263. {
  264. for ( map<string, LabeledSet>::const_iterator i = datasets.begin();
  265. i != datasets.end(); i++ )
  266. {
  267. const std::string & name = i->first;
  268. const LabeledSet & ls = i->second;
  269. const ClassNames & classNames = classnames[name];
  270. mkdir ( name.c_str(), 0755 );
  271. std::string filelist = name + "/files.txt";
  272. ofstream olist ( filelist.c_str(), ios::out );
  273. if ( !olist.good() )
  274. fthrow (IOException, "Unable to dump selections to " << filelist );
  275. LOOP_ALL_S(ls)
  276. {
  277. EACH_S(classno, file);
  278. std::string classtext = classNames.code(classno);
  279. olist << classtext << " " << file << endl;
  280. }
  281. olist.close();
  282. std::string datasetconf = name + "/dataset.conf";
  283. ofstream oconf ( datasetconf.c_str(), ios::out );
  284. if ( !oconf.good() )
  285. fthrow (IOException, "Unable to dump selections to " << datasetconf );
  286. set<int> classnos;
  287. classNames.getSelection ( "*", classnos);
  288. oconf << "[main]" << endl;
  289. oconf << "filelist = \"files.txt\"" << endl << endl;
  290. oconf << "[classnames]" << endl;
  291. for ( set<int>::const_iterator i = classnos.begin();
  292. i != classnos.end(); i++ )
  293. {
  294. const std::string & code = classNames.code(*i);
  295. const std::string & text = classNames.text(*i);
  296. oconf << code << " = \"" << text << "\"" << endl;
  297. }
  298. oconf.close();
  299. classNames.save ( name + "/classnames.txt" );
  300. }
  301. }
  302. }
  303. MultiDataset::~MultiDataset()
  304. {
  305. }
  306. const ClassNames & MultiDataset::getClassNames ( const std::string & key ) const
  307. {
  308. map<string, ClassNames>::const_iterator i = classnames.find(key);
  309. if ( i == classnames.end() )
  310. {
  311. fprintf (stderr, "MultiDataSet::getClassNames() FATAL ERROR: dataset <%s> not found !\n", key.c_str() );
  312. exit(-1);
  313. }
  314. return (i->second);
  315. }
  316. const LabeledSet *MultiDataset::operator[] ( const std::string & key ) const
  317. {
  318. map<string, LabeledSet>::const_iterator i = datasets.find(key);
  319. if ( i == datasets.end() )
  320. {
  321. fprintf (stderr, "MultiDataSet: FATAL ERROR: dataset <%s> not found !\n", key.c_str() );
  322. exit(-1);
  323. }
  324. return &(i->second);
  325. }
  326. const LabeledSet *MultiDataset::at ( const std::string & key ) const
  327. {
  328. map<string, LabeledSet>::const_iterator i = datasets.find(key);
  329. if ( i == datasets.end() )
  330. return NULL;
  331. else
  332. return &(i->second);
  333. }