# datasetAcquisition.py
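# Helpers for loading experiment data: each reader returns numpy matrices for
# training, (optionally) pool, and test splits, built from .mat, pickle,
# USPS/optdigits, or LFW sources according to the 'data' settings that
# helperFunctions.getConfig reads from the given config file.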

import scipy.io
import numpy
import pickle
import os
import time
import sys
import helperFunctions
###
# TODO: reuse readData functions for readDataForInit
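# Read the train/pool/test split for task taskIdx and random initialisation
# rndInitIdx, retrying failed reads readAttemptsNmb times with a delay of
# readAttemptsDelay seconds between attempts.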
def readDataForInit(taskIdx, rndInitIdx, configFile):
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'mat':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromMatForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'pickle':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromPickleForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readUSPSForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readLFWForInit(taskIdx, rndInitIdx, configFile)
            else:
                raise Exception('Unknown datatype %s!' % datatype)
            break
        except:
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xPool, yPool, 'pool')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
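# Basic sanity checks: x and y must have the same number of rows and contain
# only finite values.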
def checkData(x, y, identifier):
    if x.shape[0] != y.shape[0]:
        raise Exception('{} data: #x = {} != #y = {}'.format(identifier, x.shape[0], y.shape[0]))
    if not numpy.all(numpy.isfinite(x)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(x))'.format(identifier))
    if not numpy.all(numpy.isfinite(y)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(y))'.format(identifier))
###
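# Load a precomputed split from a .mat file; dataFilePattern is filled with the
# 1-based task and initialisation indices.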
def readFromMatForInit(taskIdx, rndInitIdx, configFile):
    dataFilePattern = helperFunctions.getConfig(configFile, 'data', 'dataFilePattern', None, 'str', True)
    mat = scipy.io.loadmat(dataFilePattern % (taskIdx + 1, rndInitIdx + 1))
    xTrain = numpy.asmatrix(mat['xTrain'], dtype=numpy.double)
    yTrain = numpy.asmatrix(mat['yTrain'], dtype=numpy.int)
    xPool = numpy.asmatrix(mat['xPool'], dtype=numpy.double)
    yPool = numpy.asmatrix(mat['yPool'], dtype=numpy.int)
    xTest = numpy.asmatrix(mat['xTest'], dtype=numpy.double)
    yTest = numpy.asmatrix(mat['yTest'], dtype=numpy.int)
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
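# Load train/pool/test data from pickle files: the pool is everything in the
# training file that was not selected as initial training data, minus the
# forbidden classes, plus up to numNoiseSamples noise samples.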
def readFromPickleForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'][testIdxs,:], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'][testIdxs,:], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.asmatrix(trainData['y'][trainIdxs,:], dtype=numpy.int)
    xTrain = numpy.asmatrix(trainData['X'][trainIdxs,:], dtype=numpy.double)
    ###
    poolIdxs = numpy.delete(numpy.asarray(range(trainData['y'].shape[0])), trainIdxs)
    yPool = numpy.asmatrix(trainData['y'][poolIdxs,:])
    xPool = numpy.asmatrix(trainData['X'][poolIdxs,:])
    del trainData
    for fCLs in forbiddenCls:
        idxs = numpy.where(numpy.asarray(yPool) == fCLs)
        yPool = numpy.delete(yPool, idxs[0], axis=0)
        xPool = numpy.delete(xPool, idxs[0], axis=0)
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yPool = numpy.append(yPool, numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del noiseData
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
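# Load the USPS/optdigits data from .mat files (last column holds the label);
# noise samples are appended to the pool with label -1.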
def readUSPSForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    poolIdxs = numpy.delete(numpy.asarray(range(trainData.shape[0])), trainIdxs)
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = numpy.asmatrix(trainData[trainIdxs,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[trainIdxs,-1], dtype=numpy.int).T
    xPool = numpy.asmatrix(trainData[poolIdxs,:-1], dtype=numpy.double)
    yPool = numpy.asmatrix(trainData[poolIdxs,-1], dtype=numpy.int).T
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yPool = numpy.append(yPool, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
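# Load the LFW data: classes with fewer than numMinSamples samples form the
# noise set (label -1); the remaining classes contribute numTestSplit test
# samples each, with the rest going to training/pool. The generated split is
# cached in splitFileName and reused on later runs.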
def readLFWForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName), 'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls': noiseCls, 'noiseIdxs': noiseIdxs, 'trainIdxs': trainIdxs, 'testIdxs': testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xData = data[trainIdxs,:]
    yData = labels[trainIdxs,:]
    xData = numpy.append(xData, data[noiseIdxs,:], axis=0)
    yData = numpy.append(yData, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    curTrainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    # pool indices: everything in xData/yData not used as current training data
    curPoolIdxs = numpy.delete(numpy.asarray(range(yData.shape[0])), curTrainIdxs)
    curTestIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = xData[curTrainIdxs,:]
    yTrain = yData[curTrainIdxs,:]
    xPool = xData[curPoolIdxs,:]
    yPool = yData[curPoolIdxs,:]
    xTest = xTest[curTestIdxs,:]
    yTest = yTest[curTestIdxs,:]
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
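# Read a plain train/test split (no pool), retrying failed reads as configured.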
def readData(configFile):
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'pickle':
                xTrain, yTrain, xTest, yTest = readFromPickle(configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xTest, yTest = readUSPS(configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xTest, yTest = readLFW(configFile)
            else:
                raise Exception('Unknown datatype %s!' % datatype)
            break
        except:
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xTest, yTest
###
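# Load train and test data from pickle files and append up to numNoiseSamples
# noise samples to the training set.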
def readFromPickle(configFile):
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.append(trainData['y'], numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xTrain = numpy.append(trainData['X'], numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del trainData
    del noiseData
    ###
    return xTrain, yTrain, xTest, yTest
###
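# Load the USPS/optdigits train and test data and append noise samples
# (label -1) to the training set.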
def readUSPS(configFile):
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    ###
    xTrain = numpy.asmatrix(trainData[:,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[:,-1], dtype=numpy.int).T
    xTrain = numpy.append(xTrain, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xTest, yTest
###
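# Same LFW split logic as readLFWForInit, but without a pool: noise classes get
# label -1 and the cached split in splitFileName is reused if it exists.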
def readLFW(configFile):
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName), 'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls': noiseCls, 'noiseIdxs': noiseIdxs, 'trainIdxs': trainIdxs, 'testIdxs': testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xTrain = data[trainIdxs,:]
    yTrain = labels[trainIdxs,:]
    xTrain = numpy.append(xTrain, data[noiseIdxs,:], axis=0)
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    return xTrain, yTrain, xTest, yTest
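###
# Minimal usage sketch, assuming a config file whose 'data' section defines at
# least 'datatype' and the matching file names; the path and indices below are
# hypothetical.
if __name__ == '__main__':
    configFile = 'experiment.cfg'  # hypothetical config file path
    xTrain, yTrain, xPool, yPool, xTest, yTest = readDataForInit(0, 0, configFile)
    print 'train: {}, pool: {}, test: {}'.format(xTrain.shape, xPool.shape, xTest.shape)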