import numpy as np
import pandas as pd
import math
import json
# Pre-processing of population data into the input files used by the sampler classes.
class DataProcess():
    """
    Can process data depending on different modes of samplers.
    When defining an instance, the pre_process part would automatically run.
    This is the base class for different samplers.

    Parameters:
    -----------
    data : pandas.DataFrame
        The dataframe containing geographical data. The geographical modes
        read an ``id`` column of the form ``cell.microcell.household.person``
        and the age modes read an ``age`` column.
    path : str
        The path to store the processed data. File names are appended to
        this string as-is, so it must end with a path separator.
    num_age_group : int
        This will be used when age stratification is enabled,
        indicating how many age groups are there.
        *The last group includes age >= some threshold*
    age_group_width : int
        This will be used when age stratification is enabled
        indicating the width of each age group (except for the last group)
    mode : str
        This indicates the specific mode to process the data.
        ``'AgeRegion'``, ``'Age'`` and ``'Region'`` are recognised; any other
        value (including ``None``) disables both kinds of stratification.

    Attributes:
    -----------
    gen_ageinfo : bool
        Whether generating age information
    gen_geoinfo : bool
        Whether generating geographical information
    data : pandas.DataFrame
        The demographical data from EpiABM

    """

    def __init__(self, data: pd.DataFrame, path: str = './input/', num_age_group=None, age_group_width=None,
                 mode=None):
        # Unrecognised modes leave both flags off, so only data.csv is written.
        self.gen_ageinfo = mode in ('AgeRegion', 'Age')
        self.gen_geoinfo = mode in ('AgeRegion', 'Region')
        self.data = data
        self.pre_process(path=path, num_age_group=num_age_group, age_group_width=age_group_width)

    def pre_process(self, path='./input/', num_age_group=None, age_group_width=None):
        """
        Take the geographical DataFrame then convert the data into files that Sampler classes can use

        Parameters:
        -----------
        (See explanation for the class above)

        Output:
        -------
        Will write up to three files (depending on the mode of processing chosen) into the given path.
        The first one is data.csv, contains the data for each person (always written).
        The second one is microcells.csv, contains the geographical information
        (written when ``gen_geoinfo`` is set).
        The third one is pop_dist.json, contains a list of age distribution across the population
        (written when ``gen_ageinfo`` is set).

        Raises:
        -------
        TypeError
            If age information is requested but ``num_age_group`` or
            ``age_group_width`` is not given.

        """
        df = self.data

        if self.gen_ageinfo and (num_age_group is None or age_group_width is None):
            raise TypeError('num_age_group and age_group_width are required '
                            'when age stratification is enabled')

        # Derive everything before touching the disk, so malformed input
        # does not leave a partially written set of files behind.
        age_dist = None
        if self.gen_ageinfo:
            age_dist = self._age_distribution(df['age'], num_age_group, age_group_width)

        household_df = None
        if self.gen_geoinfo:
            population_info, household_df = self._split_ids(df, include_age=self.gen_ageinfo)
        else:
            # Without region stratification the input is stored unchanged.
            population_info = df

        population_info.to_csv(path + 'data.csv', index=False)
        if household_df is not None:
            household_df.to_csv(path + 'microcells.csv', index=False)
        if age_dist is not None:
            with open(path + 'pop_dist.json', 'w') as f:
                f.write(json.dumps(age_dist))

    @staticmethod
    def _age_distribution(ages, num_age_group, age_group_width):
        """Return the fraction of people falling in each age group as a list of floats."""
        count_age = [0] * num_age_group
        last_group = num_age_group - 1
        for age in ages:
            # Ages past the last threshold are all collected in the last group.
            count_age[min(math.floor(age / age_group_width), last_group)] += 1
        population_size = len(ages)
        if population_size == 0:
            # Avoid 0/0, which would end up as NaN (not valid JSON) in the output.
            return [0.0] * num_age_group
        return [count / population_size for count in count_age]

    @staticmethod
    def _split_ids(df, include_age):
        """Build the per-person table and the per-household head count from the ``id`` column."""
        cells, microcells, households = [], [], []
        household_sizes = {}
        for person_id in df['id']:
            # The trailing component identifies the person within the household.
            cell_num, microcell_num, household_num, _ = (int(part) for part in person_id.split('.'))
            cells.append(cell_num)
            microcells.append(microcell_num)
            households.append(household_num)
            key = (cell_num, microcell_num, household_num)
            household_sizes[key] = household_sizes.get(key, 0) + 1

        # Column order of data.csv: id, [age,] cell, microcell, household.
        columns = {'id': list(df['id'])}
        if include_age:
            columns['age'] = list(df['age'])
        columns['cell'] = cells
        columns['microcell'] = microcells
        columns['household'] = households
        population_info = pd.DataFrame(columns)

        # Households are listed in order of first appearance in the input.
        household_df = pd.DataFrame(
            [(cell_num, microcell_num, household_num, size)
             for (cell_num, microcell_num, household_num), size in household_sizes.items()],
            columns=['cell', 'microcell', 'household', 'Susceptible'])
        return population_info, household_df