In [1]:
import pandas as pd
from skimage import io
import numpy as np
from PIL import Image
import os
import pickle as pkl
import time
import random
import tqdm
import datetime

In [2]:
images_root = '/home/AMMOD_data/camera_traps/BayerWald/G-Fallen/MDcrops/'
global data_dir_path
data_dir_path = os.path.join(os.getcwd(), '../../data/')
train_stream_file = data_dir_path+'data_stream_files/cv0_expsize128_crop_train_stream.pkl'
test_stream_file = data_dir_path+'data_stream_files/cv0_expsize128_crop_test_stream.pkl'
label_dict_path = data_dir_path+'label_dictionaries/BIRDS_11_Species.pkl'
label_dict_path_cal = data_dir_path+'label_dictionaries/CALTECH_LABEL_DICT.pkl'

with open(label_dict_path, 'rb') as p:
    label_dict = pkl.load(p)
with open(label_dict_path_cal, 'rb') as p:
    label_dict_cal = pkl.load(p)


In [21]:
def filter_bw_data(md_threshold=0.9, only_species=False, crops=False):
    # This is the function that filters the raw data from the 
    # all_data_MD.csv file using the megadetector max cofidence levels and other columns 
    # and returns a dataframe with all data which is then further used to create the cross-validation splits
    
    #all_data_MD_path = data_dir_path+ 'raw_data_csv_files/all_data_MD.csv'
    all_data_MD_path = '/home/boehlke/AMMOD/cam_trap_classification/data/csv_files/all_data_MD.csv'
    all_data_MD = pd.read_csv(all_data_MD_path, low_memory=False)
    all_data_MD = all_data_MD.drop(all_data_MD[all_data_MD.broken==1].index)
    data_w_animal = all_data_MD.drop(all_data_MD[all_data_MD.series_w_animal==0].index)
    data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.label==-1].index)
    data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.MD_category!=1].index)
    data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.MD_max_conf<=md_threshold].index)
    
    #later on we decided to take out these two classes because they had very few instances. 
    data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.label==4].index) #dog
    data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.label==7].index) #badger
    #data_w_animal = data_w_animal.drop(data_w_animal[data_w_animal.label==5].index) #lynx
    #lynx is also a small class but was kept in the current experiments 
    
    # the labels had to be adjusted such that the labels for the used data range from 0 to 10
    data_w_animal.label = data_w_animal.label.replace({12 :7, 11:4})

    
  
    data = data_w_animal
    if not only_species:
        # originaly several experiments were planned investigating how empty images (images without animals)
        # should best be handled
        # in this section sequences without animals in them (based on original annotaions) were filtered
        # this was later ignored as we focised on species classificaion only
        # when using this data the label dict should be edited
        
        
        data_wo_animal = all_data_MD.drop(all_data_MD[all_data_MD.series_w_animal==1].index)
        empty_wo_detection = data_wo_animal.drop(data_wo_animal[data_wo_animal.MD_category!=-1].index)
        empty_w_detection = data_wo_animal.drop(data_wo_animal[data_wo_animal.MD_category==-1].index)
        empty_w_detection = empty_w_detection.drop(empty_w_detection[empty_w_detection.MD_max_conf>=0.5].index)
        empty_data = pd.concat([empty_wo_detection, empty_w_detection], ignore_index=True)
        empty_data['label']=13
        humans = data_wo_animal.drop(data_wo_animal[data_wo_animal.MD_category!=2].index)
        humans = humans.drop(humans[humans.MD_max_conf<=md_threshold].index)
        humans['label']=14
        vehicle = data_wo_animal.drop(data_wo_animal[data_wo_animal.MD_category!=3].index)
        vehicle = vehicle.drop(vehicle[vehicle.MD_max_conf<=md_threshold].index)
        vehicle['label']=15
        #print('vehicle', vehicle.shape, unique_series_count(vehicle))
        data = pd.concat([data_w_animal, empty_data,humans,vehicle ], ignore_index=True)
    
    # only the megadetector bbox with the highest confidence was used in our experiments
    # meaning if multiple individuals were detected by the megadetector only one might be used in the data
    data['crop_file'] =data['file'].str.replace('.JPG', '.JPG___crop00_mdv4.1.jpg')
    
    return data

In [4]:
def unique_series_count(data):
    data_seq_wise = data.drop_duplicates(subset='series_code', ignore_index=True)
    return '  '+str(data_seq_wise.shape[0])

In [5]:
def filter_caltech_data(all_data_MD):
    # this is a similar function to the one above for filtering the caltech cameratrap 
    # dataset based on a csv file created to be contain similar columns as the one for the BW dataset
    
    all_data_MD = all_data_MD.drop(all_data_MD[all_data_MD.confidence<0.9].index)
    all_data_MD = all_data_MD.drop(all_data_MD[all_data_MD.datetime =='11 11'].index)
    all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
    unique_label, label_counts = np.unique(all_data_MD_seq_wise['label'], return_counts=True)
    unique_label = unique_label[label_counts>30]
    all_data_MD = all_data_MD.drop(all_data_MD[np.logical_not(np.isin(all_data_MD.label, unique_label))].index)
    unique_label, label_counts = np.unique(all_data_MD['label'], return_counts=True)

    return all_data_MD


In [6]:
def flattened(list_w_sublists):
    # flattens a list with sublists
    flattened = []
    for item in list_w_sublists:
        if  isinstance(item, list):
            for val in item:
                flattened.append(val)
        else:
            flattened.append(item)
    return flattened

In [7]:
#global series_code_list
#series_code_list = np.array([])
def get_int_series_code(series_code):
    idx = np.argwhere(series_code_list==series_code)
    print(idx)
    if idx==[]:
        idx = series_code_list.shape[0]
        series_code_list = np.concatenate([series_code_list, [series_code]])
        print(series_code_list)
    return idx
    

In [8]:
def get_cv_exp_stream_dict_from_cls_data_dict(cls_data_dict, exp_size=128, cv_splits=5, seed=1, caltech=False):
    
    # this function uses the cls_data_dict with {cls_int_label: cls_dataframe} entries 
    # and creates cross validation splits as defined by the number cv_splits
    # all data is returned in form of a dictionary that has keys in the range(cv_splits) 
    # with the data as values
    
    
    train_split_data_dict = {}
    test_split_data_dict = {}
    val_split_data_dict = {}
    
    for i in range(cv_splits):
        train_split_data_dict[i] =  pd.DataFrame(columns = cls_data_dict[0].columns)
        test_split_data_dict[i] =   pd.DataFrame(columns = cls_data_dict[0].columns)
        val_split_data_dict[i] = pd.DataFrame(columns = cls_data_dict[0].columns)
    
    for label, cls_data in cls_data_dict.items():
        cls_series_codes = np.unique(cls_data.series_code)
        np.random.seed(seed)
        np.random.shuffle(cls_series_codes)
        nr_of_series = cls_series_codes.shape[0]
        nr_series_in_split = int((1/cv_splits)*nr_of_series)
        
        for i in range(cv_splits):
            
            if i==cv_splits-1:
                split_series_codes_val = cls_series_codes[:nr_series_in_split]
                split_series_codes_test = cls_series_codes[-nr_series_in_split:]
                split_series_codes = np.concatenate([split_series_codes_val, split_series_codes_test])

            else:
                split_series_codes = cls_series_codes[i*nr_series_in_split:(i+2)*nr_series_in_split] 
                split_series_codes_test = split_series_codes[:nr_series_in_split]
                split_series_codes_val = split_series_codes[nr_series_in_split:]

            split_test_data = cls_data.drop(cls_data[np.logical_not(cls_data.series_code.isin(split_series_codes_test.tolist()))].index)
            split_val_data = cls_data.drop(cls_data[np.logical_not(cls_data.series_code.isin(split_series_codes_val.tolist()))].index)
            split_train_data = cls_data.drop(cls_data[cls_data.series_code.isin(split_series_codes.tolist())].index)

            train_split_data_dict[i] =  pd.concat([train_split_data_dict[i], split_train_data], ignore_index=True)
            test_split_data_dict[i] =  pd.concat([test_split_data_dict[i], split_test_data], ignore_index=True)
            val_split_data_dict[i] =  pd.concat([val_split_data_dict[i], split_val_data], ignore_index=True)

    total_data = 0

    total_data = train_split_data_dict[i].shape[0]+test_split_data_dict[i].shape[0]+val_split_data_dict[i].shape[0]
    cv_train_streams = {}
    cv_train_streams_crop = {}
    cv_test_data_winter = {}
    cv_val_data_winter = {}
    cv_test_data_crop_winter = {}
    cv_val_data_crop_winter = {}
    
    cv_test_data_summer = {}
    cv_val_data_summer = {}
    cv_test_data_crop_summer = {}
    cv_val_data_crop_summer = {}
    
    cv_summer_exp_list = {}
    cv_winter_exp_list = {}
    summer_start = datetime.datetime.strptime('04-21', '%m-%d')
    summer_end = datetime.datetime.strptime('09-21', '%m-%d')

    for i in range(cv_splits):
        summer_exp_list, winter_exp_list, train_stream, train_stream_crop = get_train_stream(train_split_data_dict[i], exp_size, caltech)
        test_data_winter = []
        val_data_winter = []
        test_data_crop_winter = []
        val_data_crop_winter = []
        test_data_summer = []
        val_data_summer = []
        test_data_crop_summer = []
        val_data_crop_summer = []
        
        for j, row in test_split_data_dict[i].iterrows():
            
            img_path = row.file
            img_path_crop = row.crop_file
            if not caltech: 
                img_path = row.station+'/'+row.file
                img_path_crop = row.station+'/'+row.crop_file
                
            
            tupel = (img_path, row.label, row.series_code)
            tupel_crop = (img_path_crop, row.label, row.series_code)
            
            try:
                row_date = datetime.datetime.strptime(row.datetime[5:10], '%m-%d')
                
                if row_date > summer_start and row_date < summer_end:
                    test_data_summer.append(tupel)
                    test_data_crop_summer.append(tupel_crop)
                else: 
                    test_data_winter.append(tupel)
                    test_data_crop_winter.append(tupel_crop)
                    
            except ValueError: #handles weird case where 29.02 is not accepted as a valid date
                test_data_winter.append(tupel)
                test_data_crop_winter.append(tupel_crop)
       
        
            
        for j, row in val_split_data_dict[i].iterrows():
            img_path = row.file
            img_path_crop = row.crop_file
            if not caltech: 
                img_path = row.station+'/'+row.file
                img_path_crop = row.station+'/'+row.crop_file
                
            tupel = (img_path, row.label, row.series_code)
            tupel_crop = (img_path_crop, row.label, row.series_code)
            try:
                row_date = datetime.datetime.strptime(row.datetime[5:10], '%m-%d')
                if row_date > summer_start and row_date < summer_end:
                    val_data_summer.append(tupel)
                    val_data_crop_summer.append(tupel_crop)
                else: 
                    val_data_winter.append(tupel)
                    val_data_crop_winter.append(tupel_crop)
            
            except ValueError:
                val_data_winter.append(tupel)
                val_data_crop_winter.append(tupel_crop)
       

        cv_train_streams[i] = train_stream 
        cv_train_streams_crop[i] = train_stream_crop

        cv_test_data_summer[i] = test_data_summer 
        cv_val_data_summer[i] = val_data_summer
        cv_test_data_crop_summer[i] = test_data_crop_summer 
        cv_val_data_crop_summer[i] = val_data_crop_summer

        cv_test_data_winter[i] = test_data_winter 
        cv_val_data_winter[i] = val_data_winter
        cv_test_data_crop_winter[i] = test_data_crop_winter 
        cv_val_data_crop_winter[i] = val_data_crop_winter

        cv_summer_exp_list[i]  = summer_exp_list
        cv_winter_exp_list[i] = winter_exp_list
        
    return cv_summer_exp_list, cv_winter_exp_list, cv_train_streams, cv_train_streams_crop, cv_test_data_winter, cv_val_data_winter, cv_test_data_crop_winter, cv_val_data_crop_winter, cv_test_data_summer, cv_val_data_summer, cv_test_data_crop_summer, cv_val_data_crop_summer

In [9]:
def get_train_stream(train_data, exp_size, caltech=False):
    # this function takes all the data in the  pandas dataframe train_data which is usally a subset of the 
    # entire all_data_MD and creats a stream of data, i.e. a list of sublist with experience size 128 
    # which are ordered by their datetime stamp
    # each element in the sublist is a tuple containing the (path_to_image, int_label, seq_id)
    # the returned summer_exp_list and winter_exp_list contain the indexes of sublists that contain either only 
    # summer data or only winter data
    
    train_data = train_data.sort_values('label', ascending=True)
    nr_images_lost = train_data.shape[0]%exp_size
    train_data = train_data.iloc[:-nr_images_lost,:]
    remaining_train_data = train_data
    seq_codes_and_dates = remaining_train_data.drop_duplicates('series_code',keep='first', ignore_index=True)
    seq_codes_and_dates = seq_codes_and_dates.sort_values('datetime')
    sequences = seq_codes_and_dates.series_code.tolist()
    summer_exp_list = []
    winter_exp_list = []
    train_stream = []
    train_stream_crop = []
    exp = []
    exp_crop = []
    summer_falg = 0
    winter_flag = 0
    
    # we defined three months before and after the summer solstice as summer
    summer_start = datetime.datetime.strptime('04-21', '%m-%d')
    summer_end = datetime.datetime.strptime('09-21', '%m-%d')
    for series_code in sequences:
        seq_data = remaining_train_data.drop(remaining_train_data[remaining_train_data.series_code!=series_code].index)
        
        if len(exp)+seq_data.shape[0]<=exp_size:
            for i, row in seq_data.iterrows():
                img_path = row.file
                img_path_crop = row.crop_file
                if not caltech: 
                    img_path = row.station+'/'+row.file
                    img_path_crop = row.station+'/'+row.crop_file
                try:
                   
                    row_date = datetime.datetime.strptime(row.datetime[5:10], '%m-%d')
                    if row_date > summer_start and row_date < summer_end:
                        summer_flag = 1
                    else: 
                        winter_flag = 1
                except ValueError:
                    winter_flag = 1
                tupel = (img_path, row.label, row.series_code)
                tupel_crop = (img_path_crop, row.label, row.series_code)
                exp.append(tupel)
                exp_crop.append(tupel_crop)
                
            if len(exp)==exp_size:
               
                
                if summer_flag*winter_flag == 0:
                    train_stream.append(exp)
                    train_stream_crop.append(exp_crop)
                    exp_nr = len(train_stream)-1
                    if summer_flag == 1:
                        summer_exp_list.append(exp_nr)
                    else:
                        winter_exp_list.append(exp_nr)
                else:
                    print('experience lost')
                    print(exp_nr)
                    print(summer_flag)
                    print(winter_flag)
                    
                exp = []
                exp_crop = []
                summer_flag = 0
                winter_flag = 0
                
        else: 
            needed = exp_size-len(exp)
            for i, row in seq_data.iloc[:needed,:].iterrows():
                img_path = row.file
                img_path_crop = row.crop_file
                if not caltech: 
                    img_path = row.station+'/'+row.file
                    img_path_crop = row.station+'/'+row.crop_file

                try:
                    
                    row_date = datetime.datetime.strptime(row.datetime[5:10], '%m-%d')
                    if row_date > summer_start and row_date < summer_end:
                        summer_flag = 1
                    else: 
                        winter_flag = 1
                except ValueError:
                    winter_flag = 1
                
                tupel = (img_path, row.label, row.series_code)
                tupel_crop = (img_path_crop, row.label, row.series_code)
                exp.append(tupel)
                exp_crop.append(tupel_crop)
            
            if summer_flag*winter_flag == 0:
                train_stream.append(exp)
                train_stream_crop.append(exp_crop)
                exp_nr = len(train_stream)-1
                if summer_flag == 1:
                    summer_exp_list.append(exp_nr)
                else:
                    winter_exp_list.append(exp_nr)
            else:
                #experiences with summer and winter data are removed
                
                print('experience lost')
                print(exp_nr)
                print(summer_flag)
                print(winter_flag)
                
            exp=[]
            exp_crop = []
            summer_flag = 0
            winter_flag = 0
        
            for i, row in seq_data.iloc[needed:,:].iterrows():
                img_path = row.file
                img_path_crop = row.crop_file
                if not caltech: 
                    img_path = row.station+'/'+row.file
                    img_path_crop = row.station+'/'+row.crop_file
                try:
                    row_date = datetime.datetime.strptime(row.datetime[5:10], '%m-%d')
                    if row_date > summer_start and row_date < summer_end:
                        summer_flag = 1
                    else: 
                        winter_flag = 1
                except ValueError:
                    winter_flag = 1
                tupel = (img_path, row.label, row.series_code)
                tupel_crop = (img_path_crop, row.label, row.series_code)
                exp.append(tupel)
                exp_crop.append(tupel_crop)
        remaining_train_data = remaining_train_data.drop(remaining_train_data[remaining_train_data.series_code==series_code].index)
    return summer_exp_list, winter_exp_list, train_stream, train_stream_crop


In [10]:
def get_undersampled_datasets(max_number_of_seqs, all_data, seed=1):
    # creates a dictionary with integer class labels as keys 
    # and a pandas dataframe as with all data for that class as the value
    # this is used to split the data classwise such that the distribution of classes
    # is roughly the same for each train-validation-test data
    
    cls_data_dict ={}
    nr_classes = np.unique(all_data['label']).shape[0]
  
    for i in np.unique(all_data['label']):
        cls_data = all_data.drop(all_data[all_data.label!=i].index)
        seqs = np.unique(cls_data.series_code)
        if seqs.shape[0] <= max_number_of_seqs:
            cls_data_dict[i]=cls_data
        else: 
            np.random.seed(seed)
            selected_seqs = np.random.choice(seqs, max_number_of_seqs, replace=False)
            bool_selector = np.logical_not(cls_data.series_code.isin(list(selected_seqs)))
            cls_data = cls_data.drop(cls_data[bool_selector].index)
            cls_data_dict[i]=cls_data

    full_data = cls_data_dict[0]
    for i in range(1,nr_classes):
        full_data = pd.concat([full_data,cls_data_dict[i] ],ignore_index=True)
    return full_data, cls_data_dict

## Calling functions to create stream files

In [23]:
# Here the five-fold cross validation splits where generated. 
cv_splits=5
exp_size=128

# -- For Caltech Cameratrap Data use this Block ---
dest_path =  data_dir_path+ 'data_stream_files/Caltech_stream_files/'
all_data_MD = pd.read_csv( data_dir_path+ 'raw_data_csv_files/all_data_Caltech.csv')
all_data_MD = filter_bw_data(md_threshold=0.9, only_species=True)
all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
caltech=True

# -- For Bavarian Forest Data use this Block ---
#dest_path =  data_dir_path+ 'data_stream_files/BW_stream_files/'
#all_data_MD = pd.read_csv( data_dir_path+ 'raw_data_csv_files/all_data_MD.csv')
#all_data_MD = filter_bw_data(md_threshold=0.9, only_species=True)
#all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
#caltech=False

                          
nr_cut_off_classes = 0 
# nr of classes that should be undersampled, if nr_cut_off_classes is one for example, 
# then the class with the most images will be undersampled to have as many images as the second largest class
# this was used to create different scenarios with different levels of class imbalance 

uniques = np.unique(all_data_MD_seq_wise['label'], return_counts=True)
counts = uniques[1][np.argsort(uniques[1])]
max_number_of_seqs = counts[-(nr_cut_off_classes+1)]


# all data is split class wise ensuring that the distribution of classes
# is roughly the same for each train-validation-test data 
_, cls_data_dict = get_undersampled_datasets(max_number_of_seqs, all_data_MD)

# all data is split into winter and summer data in order to use this information to evaluate 
# the accuracy when there is a domain shift in form of seasonal changes
# all variables containing train_stream in the name are a list of lists where each element in the 
# sublists is a tupel tuple containing the (path_to_image, int_label, seq_id) information for each image
# the test and validation data are a simple list with these tupel. 
# Further all  data has a normal and a '_crop' version where the paths
# in the paths in the tuple differ, pointing to crops of the original image
# preliminary experiments have shown that using the crops of the images is beneficial to the predictive accuracy
cv_summer_exp_list, cv_winter_exp_list, cv_train_stream, cv_train_stream_crop, cv_test_data_winter, cv_val_data_winter, cv_test_data_crop_winter, cv_val_data_crop_winter, cv_test_data_summer, cv_val_data_summer, cv_test_data_crop_summer, cv_val_data_crop_summer = get_cv_exp_stream_dict_from_cls_data_dict(cls_data_dict, cv_splits=cv_splits, exp_size=exp_size, caltech=caltech)
# set the caltech variable to False wehrn woreking with the Bavarian Forest data (from all_data_MD)

for i in range(cv_splits):
    file_name = 'cv'+str(i)+'_expsize'+str(exp_size)
    file_name2 = 'cv'+str(i)+'_expsize'+str(exp_size*3)
    # we wanted to investigate how the experience size influences continuous learning
    # for this two streams were created, both containing the exact data
    # for the larger experience size stream three experiences were concatenated
    # both training streams correspond to the same validation and test data  
    nr_exp_lost = len(cv_train_stream[i])%3
    if nr_exp_lost !=0:
        cv_train_stream[i] = cv_train_stream[i][:-nr_exp_lost]
        cv_train_stream_crop[i] = cv_train_stream_crop[i][:-nr_exp_lost]
        
    train_stream_larger_exp_size = [cv_train_stream[i][j]+cv_train_stream[i][j+1]+cv_train_stream[i][j+2] for j in range(0, len(cv_train_stream[i])-2, 3)]
    train_stream_larger_exp_size_crop = [cv_train_stream_crop[i][j]+cv_train_stream_crop[i][j+1]+cv_train_stream_crop[i][j+2] for j in range(0, len(cv_train_stream_crop[i])-2, 3)]
    

    test_data_i = cv_test_data_winter[i]+ cv_test_data_summer[i]
    test_data_i_label = np.array(test_data_i)[:,1]
    val_data_i = cv_val_data_winter[i]+ cv_val_data_summer[i]
    val_data_i_label = np.array(val_data_i)[:,1]

    test_data_crop_i = cv_test_data_crop_winter[i]+ cv_test_data_crop_summer[i]
    val_data_crop_i = cv_val_data_crop_winter[i]+ cv_val_data_crop_summer[i]
    season_split = {'summer' : cv_summer_exp_list[i], 'winter': cv_winter_exp_list[i]}



                            
    with open(dest_path+file_name+'_train_stream.pkl', 'wb') as handle:
        pkl.dump(cv_train_stream[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_crop_train_stream.pkl', 'wb') as handle:
        pkl.dump(cv_train_stream_crop[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name2+'_train_stream.pkl', 'wb') as handle:
        pkl.dump(train_stream_larger_exp_size, handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name2+'_crop_train_stream.pkl', 'wb') as handle:
        pkl.dump(train_stream_larger_exp_size_crop, handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_exp_season_split_dict.pkl', 'wb') as handle:
        pkl.dump(season_split, handle, protocol=pkl.HIGHEST_PROTOCOL)
    
    with open(dest_path+file_name+'_test_stream.pkl', 'wb') as handle:
        pkl.dump(test_data_i, handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_val_stream.pkl', 'wb') as handle:
        pkl.dump(val_data_i, handle, protocol=pkl.HIGHEST_PROTOCOL)
    
    with open(dest_path+file_name+'_crop_test_stream.pkl', 'wb') as handle:
        pkl.dump(test_data_crop_i, handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_crop_val_stream.pkl', 'wb') as handle:
        pkl.dump(val_data_crop_i, handle, protocol=pkl.HIGHEST_PROTOCOL)
        
        
    with open(dest_path+file_name+'_winter_test_stream.pkl', 'wb') as handle:
        pkl.dump(cv_test_data_winter[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_winter_val_stream.pkl', 'wb') as handle:
        pkl.dump(cv_val_data_winter[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
        
    with open(dest_path+file_name+'_summer_test_stream.pkl', 'wb') as handle:
        pkl.dump(cv_test_data_summer[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_summer_val_stream.pkl', 'wb') as handle:
        pkl.dump(cv_val_data_summer[i], handle, protocol=pkl.HIGHEST_PROTOCOL)#
    
    with open(dest_path+file_name+'_winter_crop_test_stream.pkl', 'wb') as handle:
        pkl.dump(cv_test_data_crop_winter[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_winter_crop_val_stream.pkl', 'wb') as handle:
        pkl.dump(cv_val_data_crop_winter[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
        
    with open(dest_path+file_name+'_summer_crop_test_stream.pkl', 'wb') as handle:
        pkl.dump(cv_test_data_crop_summer[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
    with open(dest_path+file_name+'_summer_crop_val_stream.pkl', 'wb') as handle:
        pkl.dump(cv_val_data_crop_summer[i], handle, protocol=pkl.HIGHEST_PROTOCOL)
        

  data['crop_file'] =data['file'].str.replace('.JPG', '.JPG___crop00_mdv4.1.jpg')


experience lost
98
1
1
experience lost
194
1
1
experience lost
94
1
1
experience lost
190
1
1
experience lost
96
1
1
experience lost
190
1
1
experience lost
97
1
1
experience lost
193
1
1
experience lost
100
1
1
experience lost
194
1
1


## Visiualising Distributions

In [None]:
def get_data_dist_given_lablels(label_vector, title='Number of Images per Class where MD is', hline_val=None, label_dict=label_dict):
    all_data_classes = label_vector
    uniques = np.unique(all_data_classes, return_counts=True)
    counts = uniques[1][np.argsort(uniques[1])]
    print(counts)
    label = uniques[0][np.argsort(uniques[1])]
    x_vals = range(0,label.shape[0])
    #x_vals = x_vals[np.argsort(uniques[1])]
    label_list= []
    for count in counts:
        label_list.append(str(count))
    print(label_dict.keys())
    named_label =[]
    for lab in label:
        name = label_dict[lab]
        named_label.append(name)

    barWidth = 0.9
    plt.figure(figsize=(15,10))
    plt.title(title)
    plt.bar(x_vals, counts, width = barWidth)
    if hline_val is not None:
        for val in hline_val:
            plt.axhline(y=val, color='r', linestyle='--')
    plt.xticks([r  for r in range(len(named_label))],named_label , rotation=90)# Text on the top of each bar
    plt.ylabel('Number of Images')
    for i in range(len(label_list)):
        plt.text(x = x_vals[i]-0.25 , y = counts[i]+0.1, s = label_list[i], size = 10)

In [None]:
md_threshold=0.9
all_data_MD = filter_bw_data(md_threshold=md_threshold, only_species=False)
all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
get_data_dist_given_lablels(all_data_MD_seq_wise['label'], title='Number of Images per Class: Non-Species Classes '+ 'MD>'+str(md_threshold), hline_val=[4595,1529,25031])

In [None]:
get_data_dist_given_lablels(all_data_MD_seq_wise['label'], title='Number of Sequences per Class Caltech', label_dict=label_dict_cal)

In [None]:
md_threshold=0.9
all_data_MD = filter_bw_data(md_threshold=md_threshold, only_species=True)
all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
get_data_dist_given_lablels(all_data_MD_seq_wise['label'], title='Number of Sequences per Class: Species Only '+ 'MD>'+str(md_threshold), hline_val=[261,1529,4595])

In [None]:
#distribution of sequence lengths in two datasets

all_data_MD = filter_bw_data(md_threshold=0.9, only_species=True)
all_data_MD_seq_wise = all_data_MD.drop_duplicates(subset='series_code', ignore_index=True)
_, counts = np.unique(all_data_MD.series_code, return_counts=True)
seq_len , seq_len_count = np.unique(counts, return_counts=True)
seq_len_counts_more_than_5 = np.concatenate((seq_len_count[:4], np.sum(seq_len_count[4:]).reshape(1,)))


all_data_MD = pd.read_csv('all_data_Caltech.csv')
print(all_data_MD.shape)
all_data_MD = filter_caltech_seqs(all_data_MD)
_, counts = np.unique(all_data_MD.series_code, return_counts=True)
seq_len , seq_len_count_cal = np.unique(counts, return_counts=True)


fig, axs = plt.subplots(1,2)
axs = axs.flatten()
axs[0].bar(np.arange(1,6), seq_len_count_cal/np.sum(seq_len_count_cal))
axs[1].bar(np.arange(1,6), seq_len_counts_more_than_5/np.sum(seq_len_counts_more_than_5))
axs[0].set_xticks(np.arange(1,6))
axs[1].set_xticks(np.arange(1,6))
axs[1].set_xticklabels(labels=['1', '2', '3', '4', '>5'])
axs[0].set_title('Caltech CT')
axs[1].set_title('Bayerwald')
fig.suptitle('Distribution of Seq Lengths')

In [None]:
#creating new label dict with cls_id -> cls_name as key -> value
new_birds_11_species_dict = {}
for key, val in label_dict.items():
    if val ==12:
        new_birds_11_species_dict[7]=key.replace('_', ' ').title()
    elif val ==11:
        new_birds_11_species_dict[4] = key.replace('_', ' ').title()
    elif val in range(11) and val != 4 and val != 7:
        new_birds_11_species_dict[val] = key.title().replace('_', ' ')
    
new_birds_11_species_dict[1] = 'Bird'
new_birds_11_species_dict.items()
with open('/home/boehlke/AMMOD/cam_trap_classification/data/csv_files/BIRDS_11_Species.pkl', 'wb') as handle:
    pkl.dump(new_birds_11_species_dict, handle, protocol=pkl.HIGHEST_PROTOCOL)