hace 4 meses · 3b05d7d641
--- a/src/preprocessing/transform_data.py
+++ b/src/preprocessing/transform_data.py
@@ -3,7 +3,24 @@ import pandas as pd
 
				 
			
 
				 from src.plotter import Plotter
			
 
				 
			
 
				-def transform_general_to_SIR(plotter:Plotter, dataset_path='datasets/COVID-19-Todesfaelle_in_Deutschland/', plot_name='', plot_title='', sample_rate=1, exclude=[], plot_size=(12,6), yscale_log=False, plot_legend=True):
			
 
				+state_lookup = {'Schleswig Holstein' : (1, 2897000),
			
 
				+                'Hamburg' : (2, 1841000), 
			
 
				+                'Niedersachsen' : (3, 7982000), 
			
 
				+                'Bremen' : (4, 569352),
			
 
				+                'Nordrhein-Westfalen' : (5, 17930000),
			
 
				+                'Hessen' : (6, 6266000),
			
 
				+                'Rheinland-Pfalz' : (7, 4085000),
			
 
				+                'Baden-Württemberg' : (8, 11070000),
			
 
				+                'Bayern' : (9, 13080000),
			
 
				+                'Saarland' : (10, 990509),
			
 
				+                'Berlin' : (11, 3645000),
			
 
				+                'Brandenburg' : (12, 2641000),
			
 
				+                'Mecklenburg-Vorpommern' : (13, 1610000),
			
 
				+                'Sachsen' : (14, 4078000),
			
 
				+                'Sachsen-Anhalt' : (15, 2208000),
			
 
				+                'Thüringen' : (16, 2143000)}
			
 
				+
			
 
				+def transform_data(plotter:Plotter, alpha=1/14, state_name='Germany', time_range=1200, plot_name='', plot_title='', sample_rate=1, model='SIR', plot_size=(12,6), yscale_log=False, plot_legend=True):
			
 
				     """Function to generate the SIR split from the data in the COVID-19-Todesfaelle_in_Deutschland dataset.
			
 
				 
			
 
				     Args:
			
@@ -18,147 +35,80 @@ def transform_general_to_SIR(plotter:Plotter, dataset_path='datasets/COVID-19-To
 
				         plot_legend (bool, optional): Controls if the legend is to be plotted. Defaults to True.
			
 
				     """
			
 
				     # read the data
			
 
				-    df = pd.read_csv(dataset_path + 'COVID-19-Todesfaelle_Deutschland.csv')
			
 
				-
			
 
				-    df = df.drop(df.index[1200:])
			
 
				-    
			
 
				-    # population of germany at the end of 2019
			
 
				-    N = 83100000
			
 
				-    S, I, R = np.zeros(df.shape[0]), np.zeros(df.shape[0]), np.zeros(df.shape[0])
			
 
				-
			
 
				-    # S_0 = N - I_0
			
 
				-    S[0] = N - df['Faelle_gesamt'][0]
			
 
				-    # I_0 = overall cases at the day - overall death cases at the day
			
 
				-    I[0] = df['Faelle_gesamt'][0] - df['Todesfaelle_gesamt'][0]
			
 
				-    # R_0 = overall death cases at the day
			
 
				-    R[0] = df['Todesfaelle_gesamt'][0]
			
 
				-
			
 
				-    # the recovery time is 14 days
			
 
				-    recovery_queue = np.zeros(14)
			
 
				-    
			
 
				-    for day in range(1, df.shape[0]):
			
 
				-        infections = df['Faelle_gesamt'][day] - df['Faelle_gesamt'][day-1]
			
 
				-        deaths = df['Todesfaelle_neu'][day]
			
 
				-        recoveries = recovery_queue[0]
			
 
				-
			
 
				-        S[day] = S[day-1] - infections
			
 
				-        I[day] = I[day-1] + infections - deaths - recoveries
			
 
				-        R[day] = R[day-1] + deaths + recoveries
			
 
				-
			
 
				-        # update recovery queue
			
 
				-        if I[day] < 0:
			
 
				-            recovery_queue[-1] -= I[day] 
			
 
				-            I[day] = 0
			
 
				-
			
 
				-        recovery_queue[:-1] = recovery_queue[1:]
			
 
				-        recovery_queue[-1] = infections
			
 
				-
			
 
				-    t = np.arange(0, df.shape[0], 1)
			
 
				-    if plotter != None:
			
 
				-        # plot graphs
			
 
				-        plots = []
			
 
				-        labels = []
			
 
				-
			
 
				-        if 'S' not in exclude:
			
 
				-            plots.append(S)
			
 
				-            labels.append('S')
			
 
				-        
			
 
				-        if 'I' not in exclude:
			
 
				-            plots.append(I)
			
 
				-            labels.append('I')
			
 
				-
			
 
				-        if 'R' not in exclude:
			
 
				-            plots.append(R)
			
 
				-            labels.append('R')
			
 
				-
			
 
				-        plotter.plot(t, plots, labels, plot_name, plot_title, plot_size, y_log_scale=yscale_log, plot_legend=plot_legend, xlabel='time / days', ylabel='amount of poeple')
			
 
				-
			
 
				-    COVID_Data = np.asarray([t[0::sample_rate], 
			
 
				-                             S[0::sample_rate], 
			
 
				-                             I[0::sample_rate], 
			
 
				-                             R[0::sample_rate]]) 
			
 
				-
			
 
				-    np.savetxt(f"datasets/SIR_RKI_{sample_rate}.csv", COVID_Data, delimiter=",")
			
 
				-
			
 
				 
			
 
				 
			
 
				-def get_state_cases(county_id, state_id):
			
 
				-    id = county_id // 1000
			
 
				-    return id == state_id
			
 
				-
			
 
				-def state_based_data(plotter:Plotter, state_name:str, model='SIR', alpha=1/14, time_range=1200, sample_rate=1, dataset_path='datasets/state_data/Aktuell_Deutschland_SarsCov2_Infektionen.csv'):
			
 
				-    """Transforms the RKI infection cases dataset to a SIR dataset.
			
 
				-
			
 
				-    Args:
			
 
				-        plotter (Plotter): Plotter object to plot dataset curves.
			
 
				-        state_name (str): Name of the state that is to be singled out in the new dataset.
			
 
				-        time_range (int, optional): Number of days that will be looked at in the new dataset. Defaults to 1200.
			
 
				-        sample_rate (int, optional): Sample rate used to sample the timepoints. Defaults to 1.
			
 
				-        dataset_path (str, optional): Path to the CSV file, where the data is stored. Defaults to 'datasets/state_data/Aktuell_Deutschland_SarsCov2_Infektionen.csv'.
			
 
				-    """
			
 
				-    df = pd.read_csv(dataset_path)
			
 
				-
			
 
				-    state_lookup = {'Schleswig Holstein' : (1, 2897000),
			
 
				-                    'Hamburg' : (2, 1841000), 
			
 
				-                    'Niedersachsen' : (3, 7982000), 
			
 
				-                    'Bremen' : (4, 569352),
			
 
				-                    'Nordrhein-Westfalen' : (5, 17930000),
			
 
				-                    'Hessen' : (6, 6266000),
			
 
				-                    'Rheinland-Pfalz' : (7, 4085000),
			
 
				-                    'Baden-Württemberg' : (8, 11070000),
			
 
				-                    'Bayern' : (9, 13080000),
			
 
				-                    'Saarland' : (10, 990509),
			
 
				-                    'Berlin' : (11, 3645000),
			
 
				-                    'Brandenburg' : (12, 2641000),
			
 
				-                    'Mecklenburg-Vorpommern' : (13, 1610000),
			
 
				-                    'Sachsen' : (14, 4078000),
			
 
				-                    'Sachsen-Anhalt' : (15, 2208000),
			
 
				-                    'Thüringen' : (16, 2143000)}
			
 
				-    state_ID, N = state_lookup[state_name]
			
 
				-
			
 
				-    # single out a state
			
 
				-    state_IDs = df['IdLandkreis'] // 1000
			
 
				-    state_df = df.loc[state_IDs == state_ID]
			
 
				-
			
 
				-    # sort entries by state
			
 
				-    state_df = state_df.sort_values('Refdatum')
			
 
				-    state_df = state_df.reset_index(drop=True)
			
 
				-
			
 
				-
			
 
				-    # collect cases    
			
 
				     infections = np.zeros(time_range)
			
 
				-    dead = np.zeros(time_range)
			
 
				-    recovered = np.zeros(time_range)
			
 
				-    entry_idx = 0
			
 
				-    day = 0
			
 
				-    date = state_df['Refdatum'][entry_idx]
			
 
				-    # check for each date all entries
			
 
				-    while day < time_range:
			
 
				-        # use the date sorted characteristic and take all entries with current date
			
 
				-        while state_df['Refdatum'][entry_idx] == date:
			
 
				-            # TODO use further parameters
			
 
				-            infections[day] += state_df['AnzahlFall'][entry_idx]
			
 
				-            dead[day] += state_df['AnzahlTodesfall'][entry_idx]
			
 
				-            recovered[day] += state_df['AnzahlGenesen'][entry_idx]
			
 
				-            entry_idx += 1
			
 
				-        # move day index by difference between the current and next date
			
 
				-        day += (pd.to_datetime(state_df['Refdatum'][entry_idx])-pd.to_datetime(date)).days
			
 
				-        date = state_df['Refdatum'][entry_idx]
			
 
				-
			
 
				-    S = np.zeros(time_range)
			
 
				-    I = np.zeros(time_range)
			
 
				-    R = np.zeros(time_range)
			
 
				-
			
 
				+    deaths = np.zeros(time_range)
			
 
				+    recoveries = np.zeros(time_range)
			
 
				+    if state_name == 'Germany':
			
 
				+        df = pd.read_csv('datasets/COVID-19-Todesfaelle_in_Deutschland/COVID-19-Todesfaelle_Deutschland.csv')
			
 
				+        N = 83100000
			
 
				+        infections[0] = df['Faelle_gesamt'][0]
			
 
				+        deaths[0] = df['Todesfaelle_neu'][0]
			
 
				+
			
 
				+        recovery_queue = np.zeros(14)
			
 
				+        for i in range(1, time_range):
			
 
				+            infections[i] = df['Faelle_gesamt'][i] - df['Faelle_gesamt'][i-1]
			
 
				+            deaths[i] = df['Todesfaelle_neu'][i]
			
 
				+            recoveries[i] = recovery_queue[0]
			
 
				+
			
 
				+            recovery_queue[:-1] = recovery_queue[1:]
			
 
				+            recovery_queue[-1] = infections[i]
			
 
				+    else:
			
 
				+        df = pd.read_csv('datasets/state_data/Aktuell_Deutschland_SarsCov2_Infektionen.csv')
			
 
				+        state_ID, N = state_lookup[state_name]
			
 
				+
			
 
				+        # single out a state
			
 
				+        state_IDs = df['IdLandkreis'] // 1000
			
 
				+        df = df.loc[state_IDs == state_ID]
			
 
				+
			
 
				+        # sort entries by state
			
 
				+        df = df.sort_values('Refdatum')
			
 
				+        df = df.reset_index(drop=True)
			
 
				+
			
 
				+        # collect cases    
			
 
				+        entry_idx = 0
			
 
				+        day = 0
			
 
				+        date = df['Refdatum'][entry_idx]
			
 
				+        # check for each date all entries
			
 
				+        while day < time_range:
			
 
				+            # use the date sorted characteristic and take all entries with current date
			
 
				+            while df['Refdatum'][entry_idx] == date:
			
 
				+                infections[day] += df['AnzahlFall'][entry_idx]
			
 
				+                deaths[day] += df['AnzahlTodesfall'][entry_idx]
			
 
				+                entry_idx += 1
			
 
				+            # move day index by difference between the current and next date
			
 
				+            day += (pd.to_datetime(df['Refdatum'][entry_idx])-pd.to_datetime(date)).days
			
 
				+            date = df['Refdatum'][entry_idx]
			
 
				+
			
 
				+        recovery_queue = np.zeros(14)
			
 
				+        week_counter = 2
			
 
				+        for i in range(1, time_range):
			
 
				+            recoveries[i] = recovery_queue[0]
			
 
				+
			
 
				+            recovery_queue[:-1] = recovery_queue[1:]
			
 
				+            recovery_queue[-1] = infections[i]
			
 
				+            week_counter -= 1
			
 
				+        
			
 
				+    df = df.drop(df.index[time_range:])
			
 
				+    S, I, R = np.zeros(df.shape[0]), np.zeros(df.shape[0]), np.zeros(df.shape[0])
			
 
				     # generate groups
			
 
				     S[0] = N - infections[0]
			
 
				     I[0] = infections[0]
			
 
				     R[0] = 0
			
 
				-
			
 
				-    for day in range(1, time_range):
			
 
				-        S[day] = S[day-1] - infections[day]
			
 
				-        I[day] = I[day-1] + infections[day] - I[day-1] * alpha
			
 
				-        R[day] = R[day-1] + I[day-1] * alpha
			
 
				-
			
 
				+    if model == 'I':
			
 
				+        for day in range(1, time_range):
			
 
				+            S[day] = S[day-1] - infections[day]
			
 
				+            I[day] = I[day-1] + infections[day] - I[day-1] * alpha
			
 
				+            R[day] = R[day-1] + I[day-1] * alpha
			
 
				+    else:
			
 
				+        for day in range(1, time_range):
			
 
				+            S[day] = S[day-1] - infections[day]
			
 
				+            I[day] = I[day-1] + infections[day] - deaths[day] - recoveries[day]
			
 
				+            R[day] = R[day-1] + deaths[day] + recoveries[day]
			
 
				+            if I[day] < 0:
			
 
				+                I[day] = 0
			
 
				+    
			
 
				     t = np.arange(0, time_range, 1)
			
 
				 
			
 
				     # select, which group is to be outputted
			
@@ -175,12 +125,12 @@ def state_based_data(plotter:Plotter, state_name:str, model='SIR', alpha=1/14, t
 
				     plotter.plot(t, 
			
 
				                  groups, 
			
 
				                  [*model], 
			
 
				-                 state_name.replace(' ', '_').replace('-', '_').replace('ü','ue'), 
			
 
				-                 state_name +' SI', 
			
 
				+                 state_name.replace(' ', '_').replace('-', '_').replace('ü','ue') + f"_{model}" + f"_{int(1/alpha)}", 
			
 
				+                 state_name, 
			
 
				                  (6,6), 
			
 
				                  xlabel='time / days', 
			
 
				                  ylabel='amount of people')
			
 
				 
			
 
				     COVID_Data = np.asarray([t[0::sample_rate]] + [group[0::sample_rate] for group in groups]) 
			
 
				 
			
 
				-    np.savetxt(f"datasets/{model}_RKI_{state_name.replace(' ', '_').replace('-', '_').replace('ü','ue')}_{sample_rate}.csv", COVID_Data, delimiter=",")
			
 
				+    np.savetxt(f"datasets/{model}_RKI_{state_name.replace(' ', '_').replace('-', '_').replace('ü','ue')}_{sample_rate}_{int(1/alpha)}.csv", COVID_Data, delimiter=",")