"""Data loading utilities: read training, pool, and test data from .mat, pickle,
USPS/optdigits, and LFW sources, as configured via helperFunctions.getConfig."""

import scipy.io
import numpy
import pickle
import os
import time
import sys
import helperFunctions


###
#TODO: reuse readData functions for readDataForInit
def readDataForInit(taskIdx, rndInitIdx, configFile):
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'mat':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromMatForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'pickle':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readFromPickleForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readUSPSForInit(taskIdx, rndInitIdx, configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xPool, yPool, xTest, yTest = readLFWForInit(taskIdx, rndInitIdx, configFile)
            else:
                raise Exception('Unknown datatype %s!' % datatype)
            break
        except:
            # retry a limited number of times, e.g. to survive transient file system errors
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xPool, yPool, 'pool')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest


###
def checkData(x, y, identifier):
    # sanity checks: matching sample counts and finite feature/label values
    if x.shape[0] != y.shape[0]:
        raise Exception('{} data: #x = {} != #y = {}'.format(identifier, x.shape[0], y.shape[0]))
    if not numpy.all(numpy.isfinite(x)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(x))'.format(identifier))
    if not numpy.all(numpy.isfinite(y)):
        raise Exception('{} data: not numpy.all(numpy.isfinite(y))'.format(identifier))


###
def readFromMatForInit(taskIdx, rndInitIdx, configFile):
    dataFilePattern = helperFunctions.getConfig(configFile, 'data', 'dataFilePattern', None, 'str', True)
    mat = scipy.io.loadmat(dataFilePattern % (taskIdx + 1, rndInitIdx + 1))
    xTrain = numpy.asmatrix(mat['xTrain'], dtype=numpy.double)
    yTrain = numpy.asmatrix(mat['yTrain'], dtype=numpy.int)
    xPool = numpy.asmatrix(mat['xPool'], dtype=numpy.double)
    yPool = numpy.asmatrix(mat['yPool'], dtype=numpy.int)
    xTest = numpy.asmatrix(mat['xTest'], dtype=numpy.double)
    yTest = numpy.asmatrix(mat['yTest'], dtype=numpy.int)
    return xTrain, yTrain, xPool, yPool, xTest, yTest


###
def readFromPickleForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'][testIdxs,:], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'][testIdxs,:], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.asmatrix(trainData['y'][trainIdxs,:], dtype=numpy.int)
    xTrain = numpy.asmatrix(trainData['X'][trainIdxs,:], dtype=numpy.double)
    ###
    poolIdxs = numpy.delete(numpy.asarray(range(trainData['y'].shape[0])), trainIdxs)
    yPool = numpy.asmatrix(trainData['y'][poolIdxs,:])
    xPool = numpy.asmatrix(trainData['X'][poolIdxs,:])
    del trainData
    for fCLs in forbiddenCls:
        idxs = numpy.where(numpy.asarray(yPool) == fCLs)
        yPool = numpy.delete(yPool, idxs[0], axis=0)
        xPool = numpy.delete(xPool, idxs[0], axis=0)
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yPool = numpy.append(yPool, numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del noiseData
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest


###
def readUSPSForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    trainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    poolIdxs = numpy.delete(numpy.asarray(range(trainData.shape[0])), trainIdxs)
    testIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = numpy.asmatrix(trainData[trainIdxs,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[trainIdxs,-1], dtype=numpy.int).T
    xPool = numpy.asmatrix(trainData[poolIdxs,:-1], dtype=numpy.double)
    yPool = numpy.asmatrix(trainData[poolIdxs,-1], dtype=numpy.int).T
    xPool = numpy.append(xPool, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yPool = numpy.append(yPool, numpy.asmatrix(numpy.ones((numNoiseSamples, 1)) * (-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest


###
def readLFWForInit(taskIdx, rndInitIdx, configFile):
    indicesFileName = helperFunctions.getConfig(configFile, 'data', 'indicesFileName', None, 'str', True)
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName), 'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                # classes with too few samples are used as noise
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls': noiseCls, 'noiseIdxs': noiseIdxs, 'trainIdxs': trainIdxs, 'testIdxs': testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        # load the previously written split file
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xData = data[trainIdxs,:]
    yData = labels[trainIdxs,:]
    xData = numpy.append(xData, data[noiseIdxs,:], axis=0)
    # noise samples get label -1; noiseIdxs may hold fewer than numNoiseSamples entries
    yData = numpy.append(yData, numpy.asmatrix(numpy.ones((len(noiseIdxs), 1)) * (-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    pickleIn = open(indicesFileName)
    indices = pickle.load(pickleIn)
    pickleIn.close()
    curTrainIdxs = indices['trainIdxs'][taskIdx][rndInitIdx]
    # pool = all rows of the assembled (train + noise) data that are not current training samples
    curPoolIdxs = numpy.delete(numpy.asarray(range(yData.shape[0])), curTrainIdxs)
    curTestIdxs = indices['testIdxs'][taskIdx][rndInitIdx]
    ###
    xTrain = xData[curTrainIdxs,:]
    yTrain = yData[curTrainIdxs,:]
    xPool = xData[curPoolIdxs,:]
    yPool = yData[curPoolIdxs,:]
    xTest = xTest[curTestIdxs,:]
    yTest = yTest[curTestIdxs,:]
    ###
    return xTrain, yTrain, xPool, yPool, xTest, yTest


###
def readData(configFile):
    readAttemptsNmb = helperFunctions.getConfig(configFile, 'data', 'readAttemptsNmb', 5, 'int', True)
    readAttemptsDelay = helperFunctions.getConfig(configFile, 'data', 'readAttemptsDelay', 30, 'int', True)
    datatype = helperFunctions.getConfig(configFile, 'data', 'datatype', None, 'str', True)
    readAttempt = 0
    while True:
        try:
            if datatype == 'pickle':
                xTrain, yTrain, xTest, yTest = readFromPickle(configFile)
            elif datatype == 'usps':
                xTrain, yTrain, xTest, yTest = readUSPS(configFile)
            elif datatype == 'lfw':
                xTrain, yTrain, xTest, yTest = readLFW(configFile)
            else:
                raise Exception('Unknown datatype %s!' % datatype)
            break
        except:
            readAttempt = readAttempt + 1
            if readAttempt >= readAttemptsNmb:
                raise Exception('ERROR: Reading datatype {} failed {} times!'.format(datatype, readAttempt))
            print ''
            print 'WARNING: Reading datatype {} failed ({} / {}, retry after {} seconds)!'.format(datatype, readAttempt, readAttemptsNmb, readAttemptsDelay)
            sys.stdout.flush()
            time.sleep(readAttemptsDelay)
    ###
    checkData(xTrain, yTrain, 'training')
    checkData(xTest, yTest, 'test')
    ###
    return xTrain, yTrain, xTest, yTest


###
def readFromPickle(configFile):
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    forbiddenCls = helperFunctions.getConfig(configFile, 'data', 'forbiddenCls', [], 'intList', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 20000, 'int', True)
    ###
    pickleIn = open(testFileName)
    testData = pickle.load(pickleIn)
    pickleIn.close()
    yTest = numpy.asmatrix(testData['y'], dtype=numpy.int)
    xTest = numpy.asmatrix(testData['X'], dtype=numpy.double)
    del testData
    ###
    pickleIn = open(trainFileName)
    trainData = pickle.load(pickleIn)
    pickleIn.close()
    pickleIn = open(noiseFileName)
    noiseData = pickle.load(pickleIn)
    pickleIn.close()
    yTrain = numpy.append(trainData['y'], numpy.asmatrix(noiseData['y'][:numNoiseSamples,:], dtype=numpy.int), axis=0)
    xTrain = numpy.append(trainData['X'], numpy.asmatrix(noiseData['X'][:numNoiseSamples,:], dtype=numpy.double), axis=0)
    del trainData
    del noiseData
    ###
    return xTrain, yTrain, xTest, yTest


###
def readUSPS(configFile):
    trainFileName = helperFunctions.getConfig(configFile, 'data', 'trainFileName', None, 'str', True)
    testFileName = helperFunctions.getConfig(configFile, 'data', 'testFileName', None, 'str', True)
    noiseFileName = helperFunctions.getConfig(configFile, 'data', 'noiseFileName', None, 'str', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 1797, 'int', True)
    ###
    trainData = scipy.io.loadmat(trainFileName)['optdigits']
    testData = scipy.io.loadmat(testFileName)['optdigits']
    noiseData = scipy.io.loadmat(noiseFileName)['optdigits']
    ###
    xTrain = numpy.asmatrix(trainData[:,:-1], dtype=numpy.double)
    yTrain = numpy.asmatrix(trainData[:,-1], dtype=numpy.int).T
    xTrain = numpy.append(xTrain, numpy.asmatrix(noiseData[:numNoiseSamples,:], dtype=numpy.double), axis=0)
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((numNoiseSamples, 1)) * (-1.0), dtype=numpy.int), axis=0)
    xTest = numpy.asmatrix(testData[:,:-1], dtype=numpy.double)
    yTest = numpy.asmatrix(testData[:,-1], dtype=numpy.int).T
    ###
    return xTrain, yTrain, xTest, yTest


###
def readLFW(configFile):
    dataFileName = helperFunctions.getConfig(configFile, 'data', 'dataFileName', None, 'str', True)
    numMinSamples = helperFunctions.getConfig(configFile, 'data', 'numMinSamples', 55, 'int', True)
    numNoiseSamples = helperFunctions.getConfig(configFile, 'data', 'numNoiseSamples', 400, 'int', True)
    splitFileName = helperFunctions.getConfig(configFile, 'data', 'splitFileName', os.path.join(os.path.dirname(dataFileName), 'lfwRndTestSplit.pickle'), 'str', True)
    numTestSplit = helperFunctions.getConfig(configFile, 'data', 'numTestSplit', 30, 'int', True)
    ###
    mat = scipy.io.loadmat(dataFileName)
    labels = numpy.asmatrix(mat['labels'], dtype=numpy.int)
    data = numpy.asmatrix(mat['data'], dtype=numpy.float)
    if not os.path.isfile(splitFileName):
        print 'creating new split file ...'
        cls = numpy.unique(numpy.asarray(labels))
        noiseCls = list()
        noiseIdxs = numpy.empty((0,), dtype=int)
        trainIdxs = numpy.empty((0,), dtype=int)
        testIdxs = numpy.empty((0,), dtype=int)
        for clsIdx in range(len(cls)):
            clsIndices = numpy.ravel(numpy.asarray(numpy.where(numpy.ravel(numpy.asarray(labels)) == cls[clsIdx])))
            if len(clsIndices) < numMinSamples:
                # classes with too few samples are used as noise
                noiseCls.append(cls[clsIdx])
                noiseIdxs = numpy.append(noiseIdxs, clsIndices, axis=0)
            else:
                clsIndices = clsIndices[numpy.random.permutation(len(clsIndices))]
                testIdxs = numpy.append(testIdxs, clsIndices[:numTestSplit], axis=0)
                trainIdxs = numpy.append(trainIdxs, clsIndices[numTestSplit:], axis=0)
        noiseIdxs = noiseIdxs[numpy.random.permutation(len(noiseIdxs))[:numNoiseSamples]]
        outputFile = open(splitFileName, 'w')
        pickle.dump({'noiseCls': noiseCls, 'noiseIdxs': noiseIdxs, 'trainIdxs': trainIdxs, 'testIdxs': testIdxs}, outputFile)
        outputFile.close()
    else:
        print 'loading split file ...'
        # load the previously written split file
        pickleIn = open(splitFileName)
        splitIndices = pickle.load(pickleIn)
        pickleIn.close()
        noiseIdxs = splitIndices['noiseIdxs']
        trainIdxs = splitIndices['trainIdxs']
        testIdxs = splitIndices['testIdxs']
    ###
    xTrain = data[trainIdxs,:]
    yTrain = labels[trainIdxs,:]
    xTrain = numpy.append(xTrain, data[noiseIdxs,:], axis=0)
    # noise samples get label -1; noiseIdxs may hold fewer than numNoiseSamples entries
    yTrain = numpy.append(yTrain, numpy.asmatrix(numpy.ones((len(noiseIdxs), 1)) * (-1.0), dtype=numpy.int), axis=0)
    xTest = data[testIdxs,:]
    yTest = labels[testIdxs,:]
    ###
    return xTrain, yTrain, xTest, yTest
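

###
# Illustrative usage sketch (an assumption, not part of the original module):
# assuming a config file whose [data] section defines 'datatype' and the matching
# file names for helperFunctions.getConfig, the loaders above could be driven as
# below. The path 'exampleConfig.cfg' and the task/init indices are hypothetical.
if __name__ == '__main__':
    exampleConfigFile = 'exampleConfig.cfg'  # hypothetical config path
    # plain train/test split (no pool data)
    xTrain, yTrain, xTest, yTest = readData(exampleConfigFile)
    print 'train: {}, test: {}'.format(xTrain.shape[0], xTest.shape[0])
    # per-task / per-random-initialisation split that also returns the pool
    xTrain, yTrain, xPool, yPool, xTest, yTest = readDataForInit(0, 0, exampleConfigFile)
    print 'train: {}, pool: {}, test: {}'.format(xTrain.shape[0], xPool.shape[0], xTest.shape[0])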