Source code for epios.sampler_age_region

from epios import Sampler
import pandas as pd
import numpy as np
import json
import math
# from gurobipy import Model, GRB, quicksum


[docs] class SamplerAgeRegion(Sampler): ''' The sampling class with age and region stratification. Parameters: ----------- If you want to input new data, you can input that into data argument and set the pre_process to True If you want to use previous processed data, you can input the data_store_path to read data files, and set the pre_process to False. num_age_group : int Indicating how many age groups are there. *The last group includes age >= some threshold* age_group_width : int Indicating the width of each age group (except for the last group) mode : str This indicates the specific mode to process the data. This should be the name of the modes that can be identified. **If you want this class sample as originally designed, do not change this value** ''' def __init__(self, data=None, data_store_path='./input/', pre_process=True, num_age_group=17, age_group_width=5, mode='AgeRegion'): self.mode = mode super().__init__(data=data, data_store_path=data_store_path, num_age_group=num_age_group, pre_process=pre_process, age_group_width=age_group_width, mode=self.mode) geoinfo_path = data_store_path + 'microcells.csv' ageinfo_path = data_store_path + 'pop_dist.json' self.geoinfo = pd.read_csv(geoinfo_path) self.ageinfo = ageinfo_path self.age_group_width = age_group_width
[docs] def get_age_dist(self): ''' Read the age distribution from pop_dist.json processed from DataProcess class Output: ------- config : list A list of floats, with sum 1, length should be the number of age groups ''' with open(self.ageinfo, 'r') as f: config = json.loads(f.read()) return config
[docs] def get_region_dist(self): ''' Extract the geo-distribution from the microcells.csv generated by DataProcess class Output: ------- dist : list A list of floats, with sum 1, length should be the number of cells ''' df = self.geoinfo n = df['Susceptible'].sum() dist = [] for i in range(df['cell'].max() + 1): dist.append(df[df['cell'] == i]['Susceptible'].sum() / n) return dist
def bool_exceed(self, current_age: int, current_region: int, current_block: int, cap_age: float, cap_region: float, cap_block: int): ''' Return a boolean value to tell whether the sampling is going to exceed any cap -------- Input: All inputs should be integers Output: True - means not reaching the cap False - means reaching the cap ''' if current_block + 2 > cap_block: return False elif current_age + 2 > cap_age: return False elif current_region + 2 > cap_region: return False else: return True
[docs] def multinomial_draw(self, n: int, prob: list): ''' Perform a multinomial draw with caps, it will return a tuple of lists. The first output is the number of people that I want to draw from each group, specified by age and region. The second output is for convenience of the following sampling function. Parameters: ----------- n : int The sample size prob : list List of floats, sum to 1. Length should be number of age groups times number of region groups Output: ------- res : list A list of integers indicating the number of samples from each age-region group res_cap_block : list A list of caps for each age-region group ''' # The following block trasform the probability to a list of barriers between 0 and 1 # So we can use np.rand to generate a random number between 0 and 1 to # compare with the barriers to determine which group it is df = self.data prob = np.array(prob) if n > len(df): raise ValueError('Sample size should not be greater than population size') # The following code generate the cap for each age-region group, since # there is a maximum the number of people in one age group in a region # The cap list will have shape (number of region, number of age groups) cap_block = [] len_age = len(self.get_age_dist()) record_age = len_age for i in range(len(prob)): pos_age = i % len_age pos_region = math.floor(i / len_age) ite = df[df['cell'] == pos_region] if pos_age != len_age - 1: ite = ite[ite['age'] >= pos_age * self.age_group_width] ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width] else: ite = ite[ite['age'] >= pos_age * self.age_group_width] cap_block.append(len(ite)) # Pre-process the prob, remove the prob for the age-region group with cap 0 index_0 = [] for i in range(len(cap_block)): if cap_block[i] == 0: index_0.append(i) for i in index_0: prob_exceed = prob[i] prob[i] = 0 prob = prob / (1 - prob_exceed) # Since we do not want too many samples from the same age/region group, # so we set a total cap for each age/region prob = prob.reshape((-1, len_age)) cap_age = [] cap_age_retry1 = [] cap_region = [] cap_region_retry1 = [] for i in range(np.shape(prob)[1]): if i != np.shape(prob)[1] - 1: ite = df[df['age'] >= i * self.age_group_width] ite = ite[ite['age'] < i * self.age_group_width + self.age_group_width] max_num_age = len(ite) cap_age.append(min(n * prob[:, i].sum() + 0.01 * n + 1, max_num_age)) cap_age_retry1.append(max_num_age) else: ite = df[df['age'] >= i * self.age_group_width] max_num_age = len(ite) cap_age.append(min(n * prob[:, i].sum() + 0.01 * n + 1, max_num_age)) cap_age_retry1.append(max_num_age) cap_age = [cap_age, list(np.arange(len(cap_age)))] cap_age_retry1 = [cap_age_retry1, list(np.arange(len(cap_age_retry1)))] for i in range(np.shape(prob)[0]): max_num_region = self.geoinfo[self.geoinfo['cell'] == i]['Susceptible'].sum() cap_region.append(min(n * prob[i, :].sum() + 0.005 * n + 1, max_num_region)) cap_region_retry1.append(max_num_region) cap_region = [cap_region, list(np.arange(len(cap_region)))] cap_region_retry1 = [cap_region_retry1, list(np.arange(len(cap_region_retry1)))] prob = prob.reshape((1, -1))[0] threshold = [] for i in range(len(prob)): try: threshold.append(threshold[-1] + prob[i - 1]) except IndexError: threshold.append(0) threshold.append(1) cap_block = np.array(cap_block).reshape((-1, len_age)) res_cap_block = cap_block.copy() prob_copy = prob.copy() threshold_copy = threshold.copy() # You can see there are some cap_retry above. # The reason to set up this is because: # With the cap for rows/columns may lead to a situation # where there is no solution to allocate the number of samples # under these constraints. # When this situation happens, I just loose the constraint # to the case which must have a solution and # print out a message saying there is going to be a retry. retry_number = 0 retry = True while retry_number <= 2 and retry is True: retry = False retry_number += 1 if retry_number == 1: pass elif retry_number == 2: prob = prob_copy len_age = record_age cap_block = res_cap_block cap_age = cap_age_retry1 cap_region = cap_region_retry1 threshold = threshold_copy # Set the age/region/block counter to record whether any cap is reached res = [0] * len(prob) current_age = [0] * len_age current_region = [0] * len(cap_region[0]) current_block = np.zeros((len(cap_region[0]), len_age), dtype=int) # We start the draw from here, we run the following code for each sample # to determine which age/region group it is for i in range(n): rand = np.random.rand() j = 0 while rand >= threshold[j]: j += 1 # so the program will stop when it first exceed any barrier # Locate its position of age/region group j += -1 pos_age = j % len_age pos_region = math.floor(j / len_age) # Use the above function to test whether it is going to hit the cap if self.bool_exceed(current_age[pos_age], current_region[pos_region], current_block[pos_region, pos_age], cap_age[0][pos_age], cap_region[0][pos_region], cap_block[pos_region, pos_age]): # This means it does not hit the cap res[int(cap_region[1][pos_region] * record_age + cap_age[1][pos_age])] += 1 current_age[pos_age] += 1 current_region[pos_region] += 1 current_block[pos_region, pos_age] += 1 else: # This means it hits the cap res[int(cap_region[1][pos_region] * record_age + cap_age[1][pos_age])] += 1 current_age[pos_age] += 1 current_region[pos_region] += 1 current_block[pos_region, pos_age] += 1 prob = prob.reshape((-1, len_age)) # This is testing whether it hits block cap if current_block[pos_region, pos_age] + 1 > cap_block[pos_region, pos_age]: # reduce the corresponding prob to 0, and distribute its prob to the rest of blocks prob_exceed = prob[pos_region, pos_age] if i < n - 1: if prob_exceed == prob.sum(): raise KeyError('Probability provided not supported for the sample size') prob[pos_region, pos_age] = 0 if i < n - 1: prob = prob / (1 - prob_exceed) prob = prob.reshape((1, -1))[0] # Since the prob changes, need to calculate the threshold again threshold = [] for k in range(len(prob)): try: threshold.append(threshold[-1] + prob[k - 1]) except IndexError: threshold.append(0) if threshold[-1] < 1: threshold.append(1) prob = prob.reshape((-1, len_age)) # Testing whether it hits age cap if current_age[pos_age] + 1 > cap_age[0][pos_age]: # Similarly, reduce all prob for this age group to 0, and re-distribute prob_exceed = prob[:, pos_age].sum() if i < n - 1: if prob_exceed == prob.sum(): retry = True if retry_number == 1: print('Multinomial draw failed for the first time. Retry with loose caps.') break else: prob = prob / (1 - prob_exceed) prob = np.delete(prob, pos_age, 1) cap_block = np.delete(cap_block, pos_age, 1) current_block = np.delete(current_block, pos_age, 1) len_age += -1 prob = prob.reshape((1, -1))[0] cap_age = list(np.delete(np.array(cap_age), pos_age, 1)) current_age.pop(pos_age) threshold = [] for k in range(len(prob)): try: threshold.append(threshold[-1] + prob[k - 1]) except IndexError: threshold.append(0) if len(threshold) > 0: if threshold[-1] < 1: threshold.append(1) if len_age > 0: prob = prob.reshape((-1, len_age)) # Testing whether it hits region cap if current_region[pos_region] + 1 > cap_region[0][pos_region]: # Similar to the above prob_exceed = prob[pos_region, :].sum() if i < n - 1: if prob_exceed == prob.sum(): retry = True if retry_number == 1: print('Multinomial draw failed for the first time. Retry with loose caps.') break else: prob = prob / (1 - prob_exceed) prob = np.delete(prob, pos_region, 0) cap_block = np.delete(cap_block, pos_region, 0) current_block = np.delete(current_block, pos_region, 0) if i < n - 1: prob = prob / (1 - prob_exceed) prob = prob.reshape((1, -1))[0] cap_region = list(np.delete(np.array(cap_region), pos_region, 1)) current_region.pop(pos_region) threshold = [] for k in range(len(prob)): try: threshold.append(threshold[-1] + prob[k - 1]) except IndexError: threshold.append(0) if len(threshold) > 0: if threshold[-1] < 1: threshold.append(1) if len_age > 0: prob = prob.reshape((-1, len_age)) return res, res_cap_block
[docs] def sample(self, sample_size: int, additional_sample: list = None, household_criterion=False, household_threshold: int = 3): ''' Given a sample size, and the additional sample, should return a list of people's IDs drawn from the population Parameters: ----------- sample_size : int The size of sample additional_sample : list List of integers indicating the number of additional samples drawn from each age-region group household_criterion : bool Turn on or off the household criterion household_threshold : int The maximum number of people sampled from one household Output: ------- res : list A list of ID of the sampled people ''' res = [] df = self.data # Get the age and region data age_dist = self.get_age_dist() region_dist = self.get_region_dist() # Assume age and region are two independent variables, calculate the prob # for a people in a specific age-region group ar_dist = np.array(age_dist) * np.array(region_dist).reshape((-1, 1)) ar_dist = ar_dist.reshape((1, -1))[0] # We use the multinomial distribution to draw the samples, use the above # multinomial_draw function to achieve it size = sample_size num_sample, cap = self.multinomial_draw(size, ar_dist) num_sample = np.array(num_sample) # This is for non-responders, we may want some additional samples from due to this if additional_sample is None: num_sample = num_sample.reshape((len(region_dist), -1)) else: additional_sample = np.array(additional_sample) num_sample = num_sample.reshape((len(region_dist), -1)) + additional_sample num_sample = num_sample.reshape((1, -1))[0] cap = cap.reshape((1, -1))[0] num_sample = np.array([min(elements) for elements in zip(cap, num_sample)]) num_sample = num_sample.reshape((len(region_dist), -1)) # Additional sample needs to be in the shape of (num_region, num_age) # With additional samples, need to be careful for post-processing # After we have the information of how many people we should draw from each age-region group, # Draw them using np.choice, which means completely at random # Then generate a list of IDs of the samples for i in range(len(num_sample)): for j in range(len(num_sample[0])): if j != len(num_sample[0]) - 1: ite = df[df['cell'] == i] ite = ite[ite['age'] >= j * self.age_group_width] ite = ite[ite['age'] < j * self.age_group_width + self.age_group_width] else: ite = df[df['cell'] == i] ite = ite[ite['age'] >= j * self.age_group_width] ite_sample = list(ite['id']) if household_criterion: count = 0 while count < num_sample[i, j]: if ite_sample: pass else: raise ValueError('Household threshold is too low, not enough household to generate samples') choice_ind = np.random.choice(np.arange(len(ite_sample)), size=1)[0] choice = ite_sample[choice_ind] if self.person_allowed(res, choice, threshold=household_threshold): res.append(choice) count += 1 ite_sample.pop(ite_sample.index(choice)) else: choice = np.random.choice(np.arange(len(ite_sample)), size=num_sample[i, j], replace=False) for k in choice: res.append(ite_sample[k]) return res
[docs] def additional_nonresponder(self, non_resp_id: list, sampling_percentage=0.1, proportion=0.01, threshold=None): ''' Generate the additional samples according to the non-responder IDs Parameters: ----------- non_resp_id : list A list containing the non-responder IDs sampling_percentage : float, between 0 and 1 The proportion of additional samples taken from a specific age-regional group proportion : float, between 0 and 1 The proportion of total groups to be sampled additionally threshold : NoneType or Int The lowest number of age-regional groups to be sampled additionally Note: proportion and threshold both determine the number of groups to be sampled additionally, but both are depending on how many groups can be sampled additionally Output: ------- additional_sample : list of 2D, with dimension (num_region_group, num_age_group) A list containing how many additional samples we would like to draw from each age-region group ''' num_age_group = len(self.get_age_dist()) num_region_group = len(self.get_region_dist()) df = self.data n = num_age_group * num_region_group # Transform the non_resp_id to nonRespNum to contain the number of non-responders # in each age-region group nonRespNum = [0] * (num_age_group * num_region_group) for i in non_resp_id: age = df[df['id'] == i]['age'].values[0] if math.floor(age / self.age_group_width) < num_age_group - 1: age_group_pos = math.floor(age / self.age_group_width) else: age_group_pos = num_age_group - 1 region_group_pos = df[df['id'] == i]['cell'].values[0] pos_nonRespRate = region_group_pos * num_age_group + age_group_pos nonRespNum[pos_nonRespRate] += 1 # Determine the number of groups to be sampled additionally if threshold is None: num_grp = round(proportion * n) else: num_grp = max(round(proportion * n), threshold) # Determine the position of groups to be resampled res = [] for i in range(num_grp): if max(nonRespNum) > 0: pos = nonRespNum.index(max(nonRespNum)) nonRespNum[i] = 0 res.append([pos % num_age_group, math.floor(pos / num_age_group)]) # Determine the cap for each age-region groups additional_sample = list(np.zeros((num_region_group, num_age_group), dtype=int)) if res: cap_block = [] for i in range(len(nonRespNum)): pos_age = i % num_age_group pos_region = math.floor(i / num_age_group) ite = df[df['cell'] == pos_region] if pos_age != num_age_group - 1: ite = ite[ite['age'] >= pos_age * self.age_group_width] ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width] else: ite = ite[ite['age'] >= pos_age * self.age_group_width] cap_block.append(len(ite)) cap_block = np.array(cap_block).reshape((-1, num_age_group)) # Determine the number of additional samples from the above groups for i in res: additional_sample[i[1]][i[0]] = round(sampling_percentage * cap_block[i[1], i[0]]) return additional_sample
# def optimise_draw(self, sample_size: int): # ''' # This function use package gurobipy to solve a MIQP problem to replace the multinomial_draw # function above to generate the optimal number of people drawn from each age-region group # ---------- # Prerequisites: # gurobipy package and an academic license required! # To install this package, you need to firstly register a free academic account on Guroby # website, and submit a request for a free license # Secondly, you can pip install gurobipy, then run the command provided when applying the license # to install that license. Then you finish the setup! # Input: # sample_size(int): the size of sample # Output: # res(list): If an optimal solution is found: # An 1D list of numbers, with length number of age groups * number of region groups, # indicating the number of people should be drawn from each age-region group # Otherwise: # Raise an ValueError, please go and check your sample size! # ''' # # Setup the matrix Q in the MIQP problem # num_age = len(self.get_age_dist()) # num_region = len(self.get_region_dist()) # Q = np.zeros((num_age * num_region, num_age * num_region)) # for i in range(len(Q)): # pos_age = i % num_age # pos_region = math.floor(i / num_age) # for j in range(num_age): # Q[i, pos_region * num_age + j] = 1 # for j in range(num_region): # Q[i, pos_age + j * num_age] = 1 # Q = list(Q) # # Setup the vector c in the MIQP problem # age_dist = self.get_age_dist() # region_dist = self.get_region_dist() # c = [0] * (num_age * num_region) # for i in range(num_region * num_age): # pos_age = i % num_age # pos_region = math.floor(i / num_age) # c[i] = -2 * sample_size * (age_dist[pos_age] + region_dist[pos_region]) # # Setup the constraint to keep the number picked for each age-region group # # does not exceed the existed number of people within that group # cap_block = [] # num_age = len(self.get_age_dist()) # for i in range(num_age * num_region): # pos_age = i % num_age # pos_region = math.floor(i / num_age) # ite = self.data[self.data['cell'] == pos_region] # if pos_age != num_age - 1: # ite = ite[ite['age'] >= pos_age * self.age_group_width] # ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width] # else: # ite = ite[ite['age'] >= pos_age * self.age_group_width] # cap_block.append(len(ite)) # A1_ineq = list(np.eye(num_age * num_region)) # b1_ineq = cap_block # # Setup the constraint for cap of age groups # cap_age = [] # cap_region = [] # for i in range(num_age): # if i != num_age - 1: # ite = self.data[self.data['age'] >= i * self.age_group_width] # ite = ite[ite['age'] < i * self.age_group_width + self.age_group_width] # max_num_age = len(ite) # cap_age.append(min(sample_size * age_dist[i] + 0.01 * sample_size, max_num_age)) # else: # ite = self.data[self.data['age'] >= i * self.age_group_width] # max_num_age = len(ite) # cap_age.append(min(max(sample_size * age_dist[i] + 0.01 * sample_size, 1), max_num_age)) # for i in range(num_region): # cap_region.append(min(max(sample_size * region_dist[i] + 0.005 * sample_size, 1), # self.geoinfo[self.geoinfo['cell'] == i]['Susceptible'].sum())) # A2_ineq = np.zeros((num_age, num_age * num_region)) # for i in range(num_age): # for j in range(num_region): # A2_ineq[i, i + j * num_age] = 1 # A2_ineq = list(A2_ineq) # b2_ineq = cap_age # # Setup the constraint for caps of region groups # A3_ineq = np.zeros((num_region, num_age * num_region)) # for i in range(num_region): # for j in range(num_age): # A3_ineq[i, i * num_age + j] = 1 # b3_ineq = cap_region # # Make sure that the sum of number of people sampled equals to the sample size # A_eq = list(np.ones((1, num_age * num_region))) # b_eq = [sample_size] # len_c = len(c) # # Construct the model # m = Model('miqp') # x = m.addVars(len(Q), vtype=GRB.INTEGER) # obj = quicksum(quicksum(Q[i][j] * x[i] * x[j] for j in range(len_c)) for i in range(len_c)) # obj += quicksum(c[i] * x[i] for i in range(len_c)) # m.setObjective(obj, GRB.MINIMIZE) # # Add the constraints defined above # for i in range(len(A1_ineq)): # m.addConstr(quicksum(A1_ineq[i][j] * x[j] for j in range(len_c)) <= b1_ineq[i], "constraint{}".format(i)) # for i in range(len(A2_ineq)): # m.addConstr(quicksum(A2_ineq[i][j] * x[j] for j in range(len_c)) <= b2_ineq[i], "constraint{}".format(i)) # for i in range(len(A3_ineq)): # m.addConstr(quicksum(A3_ineq[i][j] * x[j] for j in range(len_c)) <= b3_ineq[i], "constraint{}".format(i)) # for i in range(len(A_eq)): # m.addConstr(quicksum(A_eq[i][j] * x[j] for j in range(len_c)) == b_eq[i], "constraint{}".format(i)) # m.optimize() # # Return the result # if m.status == GRB.Status.OPTIMAL: # res = [i.x for i in m.getVars()] # return res # else: # raise ValueError('No solution exists for these constraints, check sample_size please')