Source code for epios.data_process

import numpy as np
import pandas as pd
import math
import json



[docs]
class DataProcess():
    """
    Can process data depending on different modes of samplers.

    When defining an instance, the pre_process part would automatically run.

    This is the base class for different samplers.

    Parameters:
    -----------

    path : str
        The path to store the processed data
    data : pandas.DataFrame
        The dataframe containing geographical data
    num_age_group : int
        This will be used when age stratification is enabled,
        indicating how many age groups are there.

        *The last group includes age >= some threshold*
    age_group_width : int
        This will be used when age stratification is enabled
        indicating the width of each age group (except for the last group)
    mode : str
        This indicates the specific mode to process the data
        This should be the name of the modes that can be identified

    Attributes:
    -----------

    gen_ageinfo : bool
        Whether generating age information
    gen_geoinfo : bool
        Whether generating demographical information
    data : pandas.DataFrame
        The demographical data from EpiABM
    """

    def __init__(self, data: pd.DataFrame, path: str = './input/', num_age_group=None, age_group_width=None, mode=None):
        self.gen_ageinfo = False
        self.gen_geoinfo = False
        if mode == 'AgeRegion':
            self.gen_ageinfo = True
            self.gen_geoinfo = True
        elif mode == 'Age':
            self.gen_ageinfo = True
        elif mode == 'Region':
            self.gen_geoinfo = True
        self.data = data
        self.pre_process(path=path, num_age_group=num_age_group, age_group_width=age_group_width)


[docs]
    def pre_process(self, path='./input/', num_age_group=None, age_group_width=None):
        """
        Take the geographical DataFrame then convert the data into files that Sampler classes can use

        Parameters:
        -----------

        (See explanation for the class above)

        Output:
        -------

        Will write three files (depending on the mode of processing chosen) into the given path.

        The first one is data.csv, contains the data for each person.

        The second one is microcells.csv, contains the geographical information.

        The third one is pop_dist.json, contains a list of age distribution across the population.
        """
        df = self.data
        if self.gen_ageinfo and self.gen_geoinfo:
            # Both age and region stratification is needed
            population_info = pd.DataFrame(columns=['id', 'age', 'cell', 'microcell', 'household'])
            household_info = {}
            population_size = len(df)
            count_age = [0] * num_age_group
            for index, row in df.iterrows():
                ind_age = math.floor(row['age'] / age_group_width)
                if ind_age < num_age_group - 1:
                    count_age[ind_age] += 1
                else:
                    count_age[-1] += 1
                person_id = row['id']
                splitted_id = [int(i) for i in person_id.split('.')]
                cell_num, microcell_num, household_num, _ = splitted_id
                # Generation of each row of data.csv file
                new_row = pd.DataFrame({'id': person_id, 'age': row['age'], 'cell': cell_num,
                                        'microcell': microcell_num, 'household': household_num}, index=[0])
                population_info = pd.concat([population_info, new_row], ignore_index=True)
                key = '.'.join([str(i) for i in splitted_id[:-1]])
                try:
                    household_info[key] += 1
                except KeyError:
                    household_info[key] = 1
            population_info.to_csv(path + 'data.csv', index=False)

            household_df = pd.DataFrame(columns=['cell', 'microcell', 'household', 'Susceptible'])
            for key, value in household_info.items():
                splitted_id = [int(i) for i in key.split('.')]
                cell_num, microcell_num, household_num = splitted_id
                # Generation of each row of microcells.csv file
                new_row = pd.DataFrame({'cell': cell_num, 'microcell': microcell_num,
                                        'household': household_num, 'Susceptible': value}, index=[0])
                household_df = pd.concat([household_df, new_row], ignore_index=True)
            household_df.to_csv(path + 'microcells.csv', index=False)

            # Generation of pop_dist.json file
            age_dist = list(np.array(count_age) / population_size)
            json_string = json.dumps(age_dist)
            with open(path + 'pop_dist.json', 'w') as f:
                f.write(json_string)
        elif self.gen_ageinfo and (~self.gen_geoinfo):
            # Only age stratification needed
            df.to_csv(path + 'data.csv', index=False)
            population_size = len(df)
            count_age = [0] * num_age_group
            for index, row in df.iterrows():
                ind_age = math.floor(row['age'] / age_group_width)
                if ind_age < num_age_group - 1:
                    count_age[ind_age] += 1
                else:
                    count_age[-1] += 1
            age_dist = list(np.array(count_age) / population_size)
            json_string = json.dumps(age_dist)
            with open(path + 'pop_dist.json', 'w') as f:
                f.write(json_string)
        elif self.gen_geoinfo and (~self.gen_ageinfo):
            # Only region stratification needed
            population_info = pd.DataFrame(columns=['id', 'cell', 'microcell', 'household'])
            household_info = {}
            population_size = len(df)
            for index, row in df.iterrows():
                person_id = row['id']
                splitted_id = [int(i) for i in person_id.split('.')]
                cell_num, microcell_num, household_num, _ = splitted_id
                new_row = pd.DataFrame({'id': person_id, 'cell': cell_num,
                                        'microcell': microcell_num, 'household': household_num}, index=[0])
                population_info = pd.concat([population_info, new_row], ignore_index=True)
                key = '.'.join([str(i) for i in splitted_id[:-1]])
                try:
                    household_info[key] += 1
                except KeyError:
                    household_info[key] = 1
            population_info.to_csv(path + 'data.csv', index=False)

            household_df = pd.DataFrame(columns=['cell', 'microcell', 'household', 'Susceptible'])
            for key, value in household_info.items():
                splitted_id = [int(i) for i in key.split('.')]
                cell_num, microcell_num, household_num = splitted_id
                new_row = pd.DataFrame({'cell': cell_num, 'microcell': microcell_num,
                                        'household': household_num, 'Susceptible': value}, index=[0])
                household_df = pd.concat([household_df, new_row], ignore_index=True)
            household_df.to_csv(path + 'microcells.csv', index=False)
        elif (~self.gen_geoinfo) and (~self.gen_ageinfo):
            # Neither of age and region stratification needed
            df.to_csv(path + 'data.csv', index=False)