import scipy.io
import numpy
import pickle
import os
import time
import sys
import helperFunctions
###
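# Data loading for the experiments: the read*ForInit functions return
# (xTrain, yTrain, xPool, yPool, xTest, yTest) for one task and one random
# initialisation, the plain read* functions return (xTrain, yTrain, xTest, yTest).
# All file names and parameters are taken from the 'data' settings of the config file.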
#TODO: reuse readData functions for readDataForInit
def readDataForInit(taskIdx, rndInitIdx, configFile):
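    """Load the train/pool/test data for one task and one random initialisation.

    The datatype ('mat', 'pickle', 'usps' or 'lfw') and all file names are read
    from the 'data' settings of configFile. Reading is retried up to
    readAttemptsNmb times with readAttemptsDelay seconds between attempts.
    """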
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'mat':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromMatForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'pickle':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromPickleForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readUSPSForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readLFWForInit(taskIdx, rndInitIdx, configFile)
            else:
                raise Exception('Unknown datatype %s!'%datatype)
            break
        except:
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xPool, yPool, 'pool')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
def checkData(x, y, identifier):
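    """Raise an Exception if x and y differ in sample count or contain non-finite values."""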
    if x.shape[0] != y.shape[0]:
        raise Exception('{} data: #x = {} != #y = {}'.format(identifier, x.shape[0], y.shape[0]))
    if not numpy.all(numpy.isfinite(x)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(x))'.format(identifier))
    if not numpy.all(numpy.isfinite(y)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(y))'.format(identifier))
###
def readFromMatForInit(taskIdx, rndInitIdx, configFile):
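    """Load train/pool/test matrices from a .mat file.

    dataFilePattern must contain two integer placeholders, filled with the
    1-based task and random-initialisation indices.
    """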
    dataFilePattern = helperFunctions.getConfig(configFile, 'data', 'dataFilePattern', None, 'str', True)
    mat = scipy.io.loadmat(dataFilePattern %(taskIdx + 1, rndInitIdx + 1))
    xTrain = numpy.asmatrix(mat['xTrain'], dtype=numpy.double)
    yTrain = numpy.asmatrix(mat['yTrain'], dtype=numpy.int)
    xPool = numpy.asmatrix(mat['xPool'], dtype=numpy.double)
    yPool = numpy.asmatrix(mat['yPool'], dtype=numpy.int)
    xTest = numpy.asmatrix(mat['xTest'], dtype=numpy.double)
    yTest = numpy.asmatrix(mat['yTest'], dtype=numpy.int)
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
def readFromPickleForInit(taskIdx, rndInitIdx, configFile):
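    """Load a train/pool/test split from pickled data files.

    The indices file selects the training and test samples for the given task
    and initialisation; the remaining training samples form the pool, from
    which the forbiddenCls classes are removed before numNoiseSamples noise
    samples are appended.
    """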
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'][testIdxs,:], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'][testIdxs,:], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.asmatrix(trainData['y'][trainIdxs,:], dtype=numpy.int)
    xTrain = numpy.asmatrix(trainData['X'][trainIdxs,:], dtype=numpy.double)
    ###
    poolIdxs = numpy.delete(numpy.asarray(range(trainData['y'].shape[0])), trainIdxs)
    yPool = numpy.asmatrix(trainData['y'][poolIdxs,:])
    xPool = numpy.asmatrix(trainData['X'][poolIdxs,:])
    del trainData
    for fCLs in forbiddenCls:
        idxs = numpy.where(numpy.asarray(yPool) == fCLs)
        yPool = numpy.delete(yPool, idxs[0], axis=0)
        xPool = numpy.delete(xPool, idxs[0], axis=0)
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yPool = numpy.append(yPool, numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del noiseData
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
def readUSPSForInit(taskIdx, rndInitIdx, configFile):
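    """Load a train/pool/test split of the digit data stored under the 'optdigits' key.

    The last column of the train/test matrices holds the label; noise samples
    appended to the pool are labelled -1.
    """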
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    poolIdxs = numpy.delete(numpy.asarray(range(trainData.shape[0])), trainIdxs)
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = numpy.asmatrix(trainData[trainIdxs,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[trainIdxs,-1], dtype=numpy.int).T
    xPool = numpy.asmatrix(trainData[poolIdxs,:-1], dtype=numpy.double)
    yPool = numpy.asmatrix(trainData[poolIdxs,-1], dtype=numpy.int).T
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yPool = numpy.append(yPool, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
def readLFWForInit(taskIdx, rndInitIdx, configFile):
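    """Load a train/pool/test split of the LFW data.

    Classes with fewer than numMinSamples samples are used as noise (label -1).
    The class-wise train/test split is created once and cached in splitFileName;
    the per-task indices then select the current training, pool and test samples.
    """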
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName),'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls':noiseCls, 'noiseIdxs':noiseIdxs, 'trainIdxs':trainIdxs, 'testIdxs':testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xData = data[trainIdxs,:]
    yData = labels[trainIdxs,:]
    xData = numpy.append(xData, data[noiseIdxs,:], axis=0)
    yData = numpy.append(yData, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    curTrainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    curPoolIdxs = numpy.delete(numpy.asarray(range(yData.shape[0])), curTrainIdxs)
    curTestIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = xData[curTrainIdxs,:]
    yTrain = yData[curTrainIdxs,:]
    xPool = xData[curPoolIdxs,:]
    yPool = yData[curPoolIdxs,:]
    xTest = xTest[curTestIdxs,:]
    yTest = yTest[curTestIdxs,:]
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest
###
def readData(configFile):
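    """Load train/test data (no pool) for the configured datatype, retrying on failure."""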
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'pickle':
                xTrain, yTrain, xTest, yTest = readFromPickle(configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xTest, yTest = readUSPS(configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xTest, yTest = readLFW(configFile)
            else:
                raise Exception('Unknown datatype %s!'%datatype)
            break
        except:
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xTest, yTest
###
def readFromPickle(configFile):
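    """Load the pickled train and test sets and append numNoiseSamples noise samples to the training data."""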
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.append(trainData['y'], numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xTrain = numpy.append(trainData['X'], numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del trainData
    del noiseData
    ###
    return xTrain, yTrain, xTest, yTest
###
def readUSPS(configFile):
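    """Load the full 'optdigits' train/test .mat files and append noise samples labelled -1 to the training data."""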
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    ###
    xTrain = numpy.asmatrix(trainData[:,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[:,-1], dtype=numpy.int).T
    xTrain = numpy.append(xTrain, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xTest, yTest
###
def readLFW(configFile):
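    """Load LFW train/test data using the cached class-wise split in splitFileName (created if missing); rare classes serve as noise samples labelled -1."""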
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName),'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls':noiseCls, 'noiseIdxs':noiseIdxs, 'trainIdxs':trainIdxs, 'testIdxs':testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xTrain = data[trainIdxs,:]
    yTrain = labels[trainIdxs,:]
    xTrain = numpy.append(xTrain, data[noiseIdxs,:], axis=0)
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((numNoiseSamples,1))*(-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    return xTrain, yTrain, xTest, yTest
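###
### Minimal usage sketch (assumption, not part of the original pipeline): the config
### file name and the 0-based task/initialisation indices below are placeholders.
if __name__ == '__main__':
    xTrain, yTrain, xPool, yPool, xTest, yTest = readDataForInit(0, 0, 'experiment.config')
    print 'train {}, pool {}, test {}'.format(xTrain.shape, xPool.shape, xTest.shape)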