RelevantIntervalSelection.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. import numpy as np
  2. from TSGeneration import genSynchronizedTimeSeries
  3. # from TSPlot import plotTS
  4. def mutual_information_short(hgram):
  5. s = float(np.sum(hgram))
  6. hgram = hgram / s
  7. px = np.sum(hgram, axis=1)
  8. py = np.sum(hgram, axis=0)
  9. px_py = px[:, None] * py[None, :]
  10. nzs = hgram > 0
  11. return np.sum(hgram[nzs] * np.log(hgram[nzs] / px_py[nzs]))
  12. def ts_distance(distancemeasure,X,Y,bins):
  13. """
  14. Similarity measures that can be called in relevant interval selection
  15. approach. Either Peasron correlation or Normalized mutual information.
  16. New similarity measures can be added with other elif statements.
  17. :param distancemeasure: 'pearson' or 'mutual'
  18. :param X: Time series
  19. :param Y: Time series
  20. :param bins: Bins for mutual information computation
  21. :return: returns similarity of X and Y
  22. """
  23. import warnings
  24. if distancemeasure == 'pearson':
  25. with warnings.catch_warnings():
  26. warnings.simplefilter("ignore")
  27. r = np.corrcoef(X, Y)[0, 1]
  28. elif distancemeasure == 'mutual':
  29. #compute best number of bins
  30. #breaks program sometimes...
  31. #k = 10
  32. #sigma = np.corrcoef(X, Y)[0][1]
  33. #if not np.isnan(sigma) or sigma == 1:
  34. # N = len(X)
  35. # k = int(np.ceil(0.5 + 0.5 * np.sqrt(
  36. # 1 + 4 * np.sqrt((6 * N * sigma * sigma) / (
  37. # 1 - (sigma * sigma))))))
  38. # Calculate histograms
  39. concat = np.hstack((X, Y))
  40. l, h = np.nanmin(concat), np.nanmax(concat)
  41. hgram = np.histogram2d(X, Y,
  42. range=[[l, h], [l, h]],
  43. bins=bins)[0]
  44. r = mutual_information_short(hgram)
  45. # Normalize mutual information
  46. r = np.sqrt(1 - np.exp(-2 * r))
  47. return r
  48. def LongestSet(IntList):
  49. """
  50. Calculate longest set of maximal correlated intervals
  51. :param IntList: 2d list of intervals, e.g. [[0,2], [1,2], [1,4], [4,3],
  52. [7,2], [8,2], [4,3]]
  53. :return: numpy array as list of intervals where sum of length of
  54. intervals is largest regarding all possible combinations of
  55. non-overlapping intervals
  56. """
  57. if type(IntList) == np.ndarray:
  58. IntList = [(x[0], x[1]) for x in IntList]
  59. k = len(IntList)
  60. if k == 0:
  61. return 0
  62. IntList = np.array(sorted(IntList))
  63. NextInt = {}
  64. # Compute immediate starting intervals after i ends
  65. for i, Interval in enumerate(IntList):
  66. end = Interval[0] + Interval[1] # Interval[0] is the starting frame and Interval[1] is the duration
  67. nextidx = IntList[IntList[:, 0] >= end]
  68. if nextidx.size > 0:
  69. minstart = np.min(nextidx[:, 0])
  70. NextInt[i] = np.argwhere(IntList[:, 0] >= minstart)
  71. else:
  72. NextInt[i] = None
  73. # Calculate cumulated sum iterating by end of the list
  74. CumSum = np.zeros(k)
  75. SelInts = {}
  76. for idx, Interval in enumerate(reversed(IntList)):
  77. i = (k - 1) - idx
  78. b = NextInt[i]
  79. if np.all(b != None):
  80. # print(
  81. # CumSum[NextInt[i]] + Interval[1])
  82. bestfollower = int(NextInt[i][np.argmax(
  83. CumSum[NextInt[i]] + Interval[1])])
  84. CumSum[i] = int(Interval[1] + CumSum[bestfollower])
  85. if Interval[1] + CumSum[bestfollower] >= CumSum[i + 1]:
  86. SelInts[i] = bestfollower
  87. else:
  88. CumSum[i] = Interval[1]
  89. SelInts[i] = None
  90. # Loop forward
  91. Result = np.array([])
  92. current = np.where(CumSum == CumSum.max())[0][-1]
  93. while True:
  94. intval = IntList[current]
  95. Result = np.append(Result, [intval])
  96. current = SelInts[current]
  97. if current not in SelInts:
  98. break
  99. Result = Result.reshape(-1, 2)
  100. return Result
  101. def LAMINABO(X, Y, threshold, minlen, maxlen, distancemeasure=None):
  102. """
  103. Computes longest Set of maximal correlated intervals, as described in
  104. Atluri et al. paper.
  105. :param X: time series
  106. :param Y: times series
  107. :param threshold: threshold for interval correlation
  108. :param minlen: minimum length of interval
  109. :param maxlen: maximum length of interval
  110. :return: returns list of intervals in form [start_index, length] of
  111. correlated, maximal, distinct intervals
  112. """
  113. indexlength = min(len(X), len(Y))
  114. IntMat = np.zeros([indexlength, maxlen-minlen+1], dtype=np.int8)
  115. # calculate correlation for minimum interval length
  116. winlen = minlen
  117. for i in range(indexlength-minlen+1):
  118. r = ts_distance(distancemeasure, X[i:i + winlen],
  119. Y[i:i + winlen], bins=10)
  120. if r >= threshold:
  121. IntMat[i, 0] = 1
  122. # calculate correlation for all intervallength > minimumlength
  123. for winlen in range(minlen+1, maxlen+1):
  124. for i in range(indexlength-winlen+1):
  125. if IntMat[i, winlen-1-minlen] == 1 and \
  126. IntMat[i+1, winlen-1-minlen] == 1:
  127. r = ts_distance(distancemeasure, X[i:i + winlen],
  128. Y[i:i + winlen], bins=10)
  129. if r >= threshold:
  130. IntMat[i, winlen-minlen] = 1
  131. CorInts = np.where(IntMat == 1)
  132. del IntMat
  133. CorInts = list(zip(CorInts[0], CorInts[1]+minlen))
  134. # check if correlated intervals are maximal
  135. ResultInt = []
  136. for i, lenWin in CorInts:
  137. if ((i - 1, lenWin + 1) not in CorInts) and ((i, lenWin + 1) not in CorInts):
  138. ResultInt += [(i, lenWin)]
  139. del CorInts
  140. return ResultInt
  141. def CalcLAMINA(X, Y, thresCorrelation, minLenInt,
  142. maxLenInt, shiftTS, distancemeasure):
  143. """
  144. Shifts time series to left an right and computes Longest set of maximal
  145. correlated intervals for each shift
  146. :param X: Time series
  147. :param Y: Time series
  148. :param thresCorrelation: Threshold for similarity measure
  149. :param minLenInt: Minimum length of intervals
  150. :param maxLenInt: Maximum length of intervals
  151. :param shiftTS: Shift time series
  152. :param distancemeasure:
  153. :return: List of lists. Each list contains maximal correlated
  154. intervals of each shift.
  155. """
  156. import multiprocessing as mp
  157. IntervalsEACH = []
  158. # Outer loop through pairs and conditions
  159. num_cores = mp.cpu_count()
  160. pool = mp.Pool(num_cores)
  161. # argss1 and argss2 are used to get all shifts (no-shift, forward and backward) and all minimum length combinations.
  162. argss1 = [(X[shift:], Y[:-shift], thresCorrelation, minLen, maxLenInt, distancemeasure) if shift != 0
  163. else (X, Y, thresCorrelation, minLen, maxLenInt, distancemeasure)
  164. for shift in shiftTS for minLen in minLenInt]
  165. argss2 = [(X[:-shift], Y[shift:], thresCorrelation, minLen, maxLenInt, distancemeasure)
  166. for shift in shiftTS for minLen in minLenInt if shift != 0]
  167. args = argss1 + argss2
  168. IntervalsEACH = pool.starmap(LAMINABO, args)
  169. pool.close()
  170. return IntervalsEACH
  171. def selectRelevantIntervals(X, Y, shifts, threshold, minLenInts, maxLenInts,
  172. distancemeasure=None):
  173. """
  174. Compute selected relevant intervals of two time series
  175. :param X: array with time series values
  176. :param Y: array with time series values
  177. :param shifts: list of values with which X is shifted back and forth in
  178. time.
  179. :param threshold: Threshold for similarity measures
  180. :param minLenInt: Minimum length of intervals
  181. :param maxLenInt: Maximum length of intervals
  182. :param shiftTS: Shift time series
  183. :return: Selected relevant intervals
  184. """
  185. IntervalsEACH = CalcLAMINA(X, Y, threshold, minLenInts, maxLenInts, shifts, distancemeasure)
  186. if len(IntervalsEACH) > 0:
  187. Intervals = [int for intlist in IntervalsEACH for int in intlist] # flatten the array
  188. if len(Intervals) > 0:
  189. relevant_Intervals = LongestSet(Intervals)
  190. return relevant_Intervals
  191. if __name__ == '__main__':
  192. """
  193. This is an example for relevant interval computation
  194. """
  195. # parameters
  196. distancemeasure = 'pearson'
  197. shifts = [0, 4, 8, 12]
  198. threshold = 0.8
  199. minLenInts = [75, 80]
  200. maxLenInts = 400
  201. # generate synchronised time series
  202. X, Y, syncI = genSynchronizedTimeSeries(lenTS=500)
  203. expressions_x = '/home/valapil/Project/ForkCausal_Adithya/test/'
  204. Windows = selectRelevantIntervals(X, Y, shifts, threshold, minLenInts,
  205. maxLenInts, distancemeasure=distancemeasure)
  206. print('Adaptable Windows: \n', Windows)
  207. print('Ground truth: \n', syncI)
  208. # plotTS(X,Y,CI=Windows,groundtruth=syncI)