Source code for epios.sampler_age_region

from epios import Sampler
import pandas as pd
import numpy as np
import json
import math
# from gurobipy import Model, GRB, quicksum



[docs]
class SamplerAgeRegion(Sampler):
    '''
    The sampling class with age and region stratification.

    Parameters:
    -----------

    If you want to input new data, you can input that into data argument and set the pre_process to True
    If you want to use previous processed data, you can input the data_store_path to read data files,
    and set the pre_process to False.

    num_age_group : int
        Indicating how many age groups are there.

        *The last group includes age >= some threshold*
    age_group_width : int
        Indicating the width of each age group (except for the last group)
    mode : str
        This indicates the specific mode to process the data.
        This should be the name of the modes that can be identified.

        **If you want this class sample as originally designed, do not change this value**

    '''

    def __init__(self, data=None, data_store_path='./input/', pre_process=True, num_age_group=17, age_group_width=5,
                 mode='AgeRegion'):
        self.mode = mode
        super().__init__(data=data, data_store_path=data_store_path,
                         num_age_group=num_age_group, pre_process=pre_process,
                         age_group_width=age_group_width, mode=self.mode)
        geoinfo_path = data_store_path + 'microcells.csv'
        ageinfo_path = data_store_path + 'pop_dist.json'
        self.geoinfo = pd.read_csv(geoinfo_path)
        self.ageinfo = ageinfo_path
        self.age_group_width = age_group_width


[docs]
    def get_age_dist(self):
        '''
        Read the age distribution from pop_dist.json processed from DataProcess class

        Output:
        -------

        config : list
            A list of floats, with sum 1, length should be the number of age groups

        '''
        with open(self.ageinfo, 'r') as f:
            config = json.loads(f.read())
        return config



[docs]
    def get_region_dist(self):
        '''
        Extract the geo-distribution from the microcells.csv generated by DataProcess class

        Output:
        -------
        dist : list
            A list of floats, with sum 1, length should be the number of cells

        '''
        df = self.geoinfo
        n = df['Susceptible'].sum()
        dist = []
        for i in range(df['cell'].max() + 1):
            dist.append(df[df['cell'] == i]['Susceptible'].sum() / n)
        return dist


    def bool_exceed(self, current_age: int, current_region: int,
                    current_block: int, cap_age: float, cap_region: float, cap_block: int):
        '''
        Return a boolean value to tell whether the sampling is going to exceed any cap
        --------
        Input:
        All inputs should be integers

        Output:
        True - means not reaching the cap
        False - means reaching the cap

        '''
        if current_block + 2 > cap_block:
            return False
        elif current_age + 2 > cap_age:
            return False
        elif current_region + 2 > cap_region:
            return False
        else:
            return True


[docs]
    def multinomial_draw(self, n: int, prob: list):
        '''
        Perform a multinomial draw with caps, it will return a tuple of lists.
        The first output is the number of people that I want to draw from each group, specified by age and region.
        The second output is for convenience of the following sampling function.

        Parameters:
        -----------
        n : int
            The sample size
        prob : list
            List of floats, sum to 1. Length should be number of age groups times number of region groups

        Output:
        -------

        res : list
            A list of integers indicating the number of samples from each age-region group
        res_cap_block : list
            A list of caps for each age-region group

        '''
        # The following block trasform the probability to a list of barriers between 0 and 1
        # So we can use np.rand to generate a random number between 0 and 1 to
        # compare with the barriers to determine which group it is
        df = self.data
        prob = np.array(prob)
        if n > len(df):
            raise ValueError('Sample size should not be greater than population size')

        # The following code generate the cap for each age-region group, since
        # there is a maximum the number of people in one age group in a region
        # The cap list will have shape (number of region, number of age groups)
        cap_block = []
        len_age = len(self.get_age_dist())
        record_age = len_age
        for i in range(len(prob)):
            pos_age = i % len_age
            pos_region = math.floor(i / len_age)
            ite = df[df['cell'] == pos_region]
            if pos_age != len_age - 1:
                ite = ite[ite['age'] >= pos_age * self.age_group_width]
                ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width]
            else:
                ite = ite[ite['age'] >= pos_age * self.age_group_width]
            cap_block.append(len(ite))

        # Pre-process the prob, remove the prob for the age-region group with cap 0
        index_0 = []
        for i in range(len(cap_block)):
            if cap_block[i] == 0:
                index_0.append(i)
        for i in index_0:
            prob_exceed = prob[i]
            prob[i] = 0
            prob = prob / (1 - prob_exceed)

        # Since we do not want too many samples from the same age/region group,
        # so we set a total cap for each age/region
        prob = prob.reshape((-1, len_age))
        cap_age = []
        cap_age_retry1 = []
        cap_region = []
        cap_region_retry1 = []
        for i in range(np.shape(prob)[1]):
            if i != np.shape(prob)[1] - 1:
                ite = df[df['age'] >= i * self.age_group_width]
                ite = ite[ite['age'] < i * self.age_group_width + self.age_group_width]
                max_num_age = len(ite)
                cap_age.append(min(n * prob[:, i].sum() + 0.01 * n + 1, max_num_age))
                cap_age_retry1.append(max_num_age)
            else:
                ite = df[df['age'] >= i * self.age_group_width]
                max_num_age = len(ite)
                cap_age.append(min(n * prob[:, i].sum() + 0.01 * n + 1, max_num_age))
                cap_age_retry1.append(max_num_age)
        cap_age = [cap_age, list(np.arange(len(cap_age)))]
        cap_age_retry1 = [cap_age_retry1, list(np.arange(len(cap_age_retry1)))]
        for i in range(np.shape(prob)[0]):
            max_num_region = self.geoinfo[self.geoinfo['cell'] == i]['Susceptible'].sum()
            cap_region.append(min(n * prob[i, :].sum() + 0.005 * n + 1, max_num_region))
            cap_region_retry1.append(max_num_region)
        cap_region = [cap_region, list(np.arange(len(cap_region)))]
        cap_region_retry1 = [cap_region_retry1, list(np.arange(len(cap_region_retry1)))]
        prob = prob.reshape((1, -1))[0]

        threshold = []
        for i in range(len(prob)):
            try:
                threshold.append(threshold[-1] + prob[i - 1])
            except IndexError:
                threshold.append(0)
        threshold.append(1)
        cap_block = np.array(cap_block).reshape((-1, len_age))
        res_cap_block = cap_block.copy()

        prob_copy = prob.copy()
        threshold_copy = threshold.copy()

        # You can see there are some cap_retry above.
        # The reason to set up this is because:
        # With the cap for rows/columns may lead to a situation
        # where there is no solution to allocate the number of samples
        # under these constraints.
        # When this situation happens, I just loose the constraint
        # to the case which must have a solution and
        # print out a message saying there is going to be a retry.
        retry_number = 0
        retry = True
        while retry_number <= 2 and retry is True:
            retry = False
            retry_number += 1
            if retry_number == 1:
                pass
            elif retry_number == 2:
                prob = prob_copy
                len_age = record_age
                cap_block = res_cap_block
                cap_age = cap_age_retry1
                cap_region = cap_region_retry1
                threshold = threshold_copy
            # Set the age/region/block counter to record whether any cap is reached
            res = [0] * len(prob)
            current_age = [0] * len_age
            current_region = [0] * len(cap_region[0])
            current_block = np.zeros((len(cap_region[0]), len_age), dtype=int)

            # We start the draw from here, we run the following code for each sample
            # to determine which age/region group it is
            for i in range(n):
                rand = np.random.rand()
                j = 0
                while rand >= threshold[j]:
                    j += 1
                # so the program will stop when it first exceed any barrier

                # Locate its position of age/region group
                j += -1
                pos_age = j % len_age
                pos_region = math.floor(j / len_age)

                # Use the above function to test whether it is going to hit the cap
                if self.bool_exceed(current_age[pos_age], current_region[pos_region],
                                    current_block[pos_region, pos_age], cap_age[0][pos_age],
                                    cap_region[0][pos_region], cap_block[pos_region, pos_age]):
                    # This means it does not hit the cap
                    res[int(cap_region[1][pos_region] * record_age + cap_age[1][pos_age])] += 1
                    current_age[pos_age] += 1
                    current_region[pos_region] += 1
                    current_block[pos_region, pos_age] += 1
                else:
                    # This means it hits the cap
                    res[int(cap_region[1][pos_region] * record_age + cap_age[1][pos_age])] += 1
                    current_age[pos_age] += 1
                    current_region[pos_region] += 1
                    current_block[pos_region, pos_age] += 1
                    prob = prob.reshape((-1, len_age))

                    # This is testing whether it hits block cap
                    if current_block[pos_region, pos_age] + 1 > cap_block[pos_region, pos_age]:
                        # reduce the corresponding prob to 0, and distribute its prob to the rest of blocks
                        prob_exceed = prob[pos_region, pos_age]
                        if i < n - 1:
                            if prob_exceed == prob.sum():
                                raise KeyError('Probability provided not supported for the sample size')
                        prob[pos_region, pos_age] = 0
                        if i < n - 1:
                            prob = prob / (1 - prob_exceed)
                        prob = prob.reshape((1, -1))[0]
                        # Since the prob changes, need to calculate the threshold again
                        threshold = []
                        for k in range(len(prob)):
                            try:
                                threshold.append(threshold[-1] + prob[k - 1])
                            except IndexError:
                                threshold.append(0)
                        if threshold[-1] < 1:
                            threshold.append(1)
                        prob = prob.reshape((-1, len_age))

                    # Testing whether it hits age cap
                    if current_age[pos_age] + 1 > cap_age[0][pos_age]:
                        # Similarly, reduce all prob for this age group to 0, and re-distribute
                        prob_exceed = prob[:, pos_age].sum()
                        if i < n - 1:
                            if prob_exceed == prob.sum():
                                retry = True
                                if retry_number == 1:
                                    print('Multinomial draw failed for the first time. Retry with loose caps.')
                                    break
                            else:
                                prob = prob / (1 - prob_exceed)
                        prob = np.delete(prob, pos_age, 1)
                        cap_block = np.delete(cap_block, pos_age, 1)
                        current_block = np.delete(current_block, pos_age, 1)
                        len_age += -1
                        prob = prob.reshape((1, -1))[0]
                        cap_age = list(np.delete(np.array(cap_age), pos_age, 1))
                        current_age.pop(pos_age)
                        threshold = []
                        for k in range(len(prob)):
                            try:
                                threshold.append(threshold[-1] + prob[k - 1])
                            except IndexError:
                                threshold.append(0)
                        if len(threshold) > 0:
                            if threshold[-1] < 1:
                                threshold.append(1)
                        if len_age > 0:
                            prob = prob.reshape((-1, len_age))

                    # Testing whether it hits region cap
                    if current_region[pos_region] + 1 > cap_region[0][pos_region]:
                        # Similar to the above
                        prob_exceed = prob[pos_region, :].sum()
                        if i < n - 1:
                            if prob_exceed == prob.sum():
                                retry = True
                                if retry_number == 1:
                                    print('Multinomial draw failed for the first time. Retry with loose caps.')
                                    break
                            else:
                                prob = prob / (1 - prob_exceed)
                        prob = np.delete(prob, pos_region, 0)
                        cap_block = np.delete(cap_block, pos_region, 0)
                        current_block = np.delete(current_block, pos_region, 0)
                        if i < n - 1:
                            prob = prob / (1 - prob_exceed)
                        prob = prob.reshape((1, -1))[0]
                        cap_region = list(np.delete(np.array(cap_region), pos_region, 1))
                        current_region.pop(pos_region)
                        threshold = []
                        for k in range(len(prob)):
                            try:
                                threshold.append(threshold[-1] + prob[k - 1])
                            except IndexError:
                                threshold.append(0)
                        if len(threshold) > 0:
                            if threshold[-1] < 1:
                                threshold.append(1)
                        if len_age > 0:
                            prob = prob.reshape((-1, len_age))
        return res, res_cap_block



[docs]
    def sample(self, sample_size: int, additional_sample: list = None,
               household_criterion=False, household_threshold: int = 3):
        '''
        Given a sample size, and the additional sample, should return a list of people's IDs drawn from the population

        Parameters:
        -----------

        sample_size : int
            The size of sample
        additional_sample : list
            List of integers indicating the number of additional samples drawn from each age-region group
        household_criterion : bool
            Turn on or off the household criterion
        household_threshold : int
            The maximum number of people sampled from one household

        Output:
        -------

        res : list
            A list of ID of the sampled people

        '''
        res = []
        df = self.data

        # Get the age and region data
        age_dist = self.get_age_dist()
        region_dist = self.get_region_dist()

        # Assume age and region are two independent variables, calculate the prob
        # for a people in a specific age-region group
        ar_dist = np.array(age_dist) * np.array(region_dist).reshape((-1, 1))
        ar_dist = ar_dist.reshape((1, -1))[0]

        # We use the multinomial distribution to draw the samples, use the above
        # multinomial_draw function to achieve it
        size = sample_size
        num_sample, cap = self.multinomial_draw(size, ar_dist)
        num_sample = np.array(num_sample)

        # This is for non-responders, we may want some additional samples from due to this
        if additional_sample is None:
            num_sample = num_sample.reshape((len(region_dist), -1))
        else:
            additional_sample = np.array(additional_sample)
            num_sample = num_sample.reshape((len(region_dist), -1)) + additional_sample
            num_sample = num_sample.reshape((1, -1))[0]
            cap = cap.reshape((1, -1))[0]
            num_sample = np.array([min(elements) for elements in zip(cap, num_sample)])
            num_sample = num_sample.reshape((len(region_dist), -1))
        # Additional sample needs to be in the shape of (num_region, num_age)
        # With additional samples, need to be careful for post-processing

        # After we have the information of how many people we should draw from each age-region group,
        # Draw them using np.choice, which means completely at random
        # Then generate a list of IDs of the samples
        for i in range(len(num_sample)):
            for j in range(len(num_sample[0])):
                if j != len(num_sample[0]) - 1:
                    ite = df[df['cell'] == i]
                    ite = ite[ite['age'] >= j * self.age_group_width]
                    ite = ite[ite['age'] < j * self.age_group_width + self.age_group_width]
                else:
                    ite = df[df['cell'] == i]
                    ite = ite[ite['age'] >= j * self.age_group_width]
                ite_sample = list(ite['id'])
                if household_criterion:
                    count = 0
                    while count < num_sample[i, j]:
                        if ite_sample:
                            pass
                        else:
                            raise ValueError('Household threshold is too low, not enough household to generate samples')
                        choice_ind = np.random.choice(np.arange(len(ite_sample)), size=1)[0]
                        choice = ite_sample[choice_ind]
                        if self.person_allowed(res, choice, threshold=household_threshold):
                            res.append(choice)
                            count += 1
                        ite_sample.pop(ite_sample.index(choice))
                else:
                    choice = np.random.choice(np.arange(len(ite_sample)), size=num_sample[i, j], replace=False)
                    for k in choice:
                        res.append(ite_sample[k])
        return res



[docs]
    def additional_nonresponder(self, non_resp_id: list, sampling_percentage=0.1, proportion=0.01, threshold=None):
        '''
        Generate the additional samples according to the non-responder IDs

        Parameters:
        -----------

        non_resp_id : list
            A list containing the non-responder IDs
        sampling_percentage : float, between 0 and 1
            The proportion of additional samples taken from a specific age-regional group
        proportion : float, between 0 and 1
            The proportion of total groups to be sampled additionally
        threshold : NoneType or Int
            The lowest number of age-regional groups to be sampled additionally

        Note: proportion and threshold both determine the number of groups to be sampled additionally,
               but both are depending on how many groups can be sampled additionally

        Output:
        -------

        additional_sample : list of 2D, with dimension (num_region_group, num_age_group)
            A list containing how many additional samples we would like to draw from each age-region group

        '''
        num_age_group = len(self.get_age_dist())
        num_region_group = len(self.get_region_dist())
        df = self.data
        n = num_age_group * num_region_group

        # Transform the non_resp_id to nonRespNum to contain the number of non-responders
        # in each age-region group

        nonRespNum = [0] * (num_age_group * num_region_group)
        for i in non_resp_id:
            age = df[df['id'] == i]['age'].values[0]
            if math.floor(age / self.age_group_width) < num_age_group - 1:
                age_group_pos = math.floor(age / self.age_group_width)
            else:
                age_group_pos = num_age_group - 1
            region_group_pos = df[df['id'] == i]['cell'].values[0]
            pos_nonRespRate = region_group_pos * num_age_group + age_group_pos
            nonRespNum[pos_nonRespRate] += 1

        # Determine the number of groups to be sampled additionally
        if threshold is None:
            num_grp = round(proportion * n)
        else:
            num_grp = max(round(proportion * n), threshold)

        # Determine the position of groups to be resampled
        res = []
        for i in range(num_grp):
            if max(nonRespNum) > 0:
                pos = nonRespNum.index(max(nonRespNum))
                nonRespNum[i] = 0
                res.append([pos % num_age_group, math.floor(pos / num_age_group)])

        # Determine the cap for each age-region groups
        additional_sample = list(np.zeros((num_region_group, num_age_group), dtype=int))
        if res:
            cap_block = []
            for i in range(len(nonRespNum)):
                pos_age = i % num_age_group
                pos_region = math.floor(i / num_age_group)
                ite = df[df['cell'] == pos_region]
                if pos_age != num_age_group - 1:
                    ite = ite[ite['age'] >= pos_age * self.age_group_width]
                    ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width]
                else:
                    ite = ite[ite['age'] >= pos_age * self.age_group_width]
                cap_block.append(len(ite))
            cap_block = np.array(cap_block).reshape((-1, num_age_group))

        # Determine the number of additional samples from the above groups
        for i in res:
            additional_sample[i[1]][i[0]] = round(sampling_percentage * cap_block[i[1], i[0]])
        return additional_sample



    # def optimise_draw(self, sample_size: int):
    #     '''
    #     This function use package gurobipy to solve a MIQP problem to replace the multinomial_draw
    #     function above to generate the optimal number of people drawn from each age-region group
    #     ----------
    #     Prerequisites:
    #     gurobipy package and an academic license required!
    #     To install this package, you need to firstly register a free academic account on Guroby
    #     website, and submit a request for a free license
    #     Secondly, you can pip install gurobipy, then run the command provided when applying the license
    #     to install that license. Then you finish the setup!

    #     Input:
    #     sample_size(int): the size of sample

    #     Output:
    #     res(list): If an optimal solution is found:
    #                An 1D list of numbers, with length number of age groups * number of region groups,
    #                indicating the number of people should be drawn from each age-region group
    #                Otherwise:
    #                Raise an ValueError, please go and check your sample size!

    #     '''
    #     # Setup the matrix Q in the MIQP problem
    #     num_age = len(self.get_age_dist())
    #     num_region = len(self.get_region_dist())
    #     Q = np.zeros((num_age * num_region, num_age * num_region))
    #     for i in range(len(Q)):
    #         pos_age = i % num_age
    #         pos_region = math.floor(i / num_age)
    #         for j in range(num_age):
    #             Q[i, pos_region * num_age + j] = 1
    #         for j in range(num_region):
    #             Q[i, pos_age + j * num_age] = 1
    #     Q = list(Q)

    #     # Setup the vector c in the MIQP problem
    #     age_dist = self.get_age_dist()
    #     region_dist = self.get_region_dist()
    #     c = [0] * (num_age * num_region)
    #     for i in range(num_region * num_age):
    #         pos_age = i % num_age
    #         pos_region = math.floor(i / num_age)
    #         c[i] = -2 * sample_size * (age_dist[pos_age] + region_dist[pos_region])

    #     # Setup the constraint to keep the number picked for each age-region group
    #     # does not exceed the existed number of people within that group
    #     cap_block = []
    #     num_age = len(self.get_age_dist())
    #     for i in range(num_age * num_region):
    #         pos_age = i % num_age
    #         pos_region = math.floor(i / num_age)
    #         ite = self.data[self.data['cell'] == pos_region]
    #         if pos_age != num_age - 1:
    #             ite = ite[ite['age'] >= pos_age * self.age_group_width]
    #             ite = ite[ite['age'] < pos_age * self.age_group_width + self.age_group_width]
    #         else:
    #             ite = ite[ite['age'] >= pos_age * self.age_group_width]
    #         cap_block.append(len(ite))
    #     A1_ineq = list(np.eye(num_age * num_region))
    #     b1_ineq = cap_block

    #     # Setup the constraint for cap of age groups
    #     cap_age = []
    #     cap_region = []
    #     for i in range(num_age):
    #         if i != num_age - 1:
    #             ite = self.data[self.data['age'] >= i * self.age_group_width]
    #             ite = ite[ite['age'] < i * self.age_group_width + self.age_group_width]
    #             max_num_age = len(ite)
    #             cap_age.append(min(sample_size * age_dist[i] + 0.01 * sample_size, max_num_age))
    #         else:
    #             ite = self.data[self.data['age'] >= i * self.age_group_width]
    #             max_num_age = len(ite)
    #             cap_age.append(min(max(sample_size * age_dist[i] + 0.01 * sample_size, 1), max_num_age))
    #     for i in range(num_region):
    #         cap_region.append(min(max(sample_size * region_dist[i] + 0.005 * sample_size, 1),
    #                               self.geoinfo[self.geoinfo['cell'] == i]['Susceptible'].sum()))
    #     A2_ineq = np.zeros((num_age, num_age * num_region))
    #     for i in range(num_age):
    #         for j in range(num_region):
    #             A2_ineq[i, i + j * num_age] = 1
    #     A2_ineq = list(A2_ineq)
    #     b2_ineq = cap_age

    #     # Setup the constraint for caps of region groups
    #     A3_ineq = np.zeros((num_region, num_age * num_region))
    #     for i in range(num_region):
    #         for j in range(num_age):
    #             A3_ineq[i, i * num_age + j] = 1
    #     b3_ineq = cap_region

    #     # Make sure that the sum of number of people sampled equals to the sample size
    #     A_eq = list(np.ones((1, num_age * num_region)))
    #     b_eq = [sample_size]

    #     len_c = len(c)

    #     # Construct the model
    #     m = Model('miqp')

    #     x = m.addVars(len(Q), vtype=GRB.INTEGER)

    #     obj = quicksum(quicksum(Q[i][j] * x[i] * x[j] for j in range(len_c)) for i in range(len_c))
    #     obj += quicksum(c[i] * x[i] for i in range(len_c))
    #     m.setObjective(obj, GRB.MINIMIZE)

    #     # Add the constraints defined above
    #     for i in range(len(A1_ineq)):
    #         m.addConstr(quicksum(A1_ineq[i][j] * x[j] for j in range(len_c)) <= b1_ineq[i], "constraint{}".format(i))

    #     for i in range(len(A2_ineq)):
    #         m.addConstr(quicksum(A2_ineq[i][j] * x[j] for j in range(len_c)) <= b2_ineq[i], "constraint{}".format(i))

    #     for i in range(len(A3_ineq)):
    #         m.addConstr(quicksum(A3_ineq[i][j] * x[j] for j in range(len_c)) <= b3_ineq[i], "constraint{}".format(i))

    #     for i in range(len(A_eq)):
    #         m.addConstr(quicksum(A_eq[i][j] * x[j] for j in range(len_c)) == b_eq[i], "constraint{}".format(i))

    #     m.optimize()

    #     # Return the result
    #     if m.status == GRB.Status.OPTIMAL:
    #         res = [i.x for i in m.getVars()]
    #         return res
    #     else:
    #         raise ValueError('No solution exists for these constraints, check sample_size please')