Source code for epios.post_process

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from itertools import product
import multiprocessing
import sys
import os
from epios import Sampler, SamplerAge, SamplerAgeRegion, SamplerRegion, SamplingMaker



[docs]
class PostProcess():
    '''
    This class is to automatically sample the population at several given time points.

    And generate plots and comparison with the true infection level within the population.

    How to use:
    -----------

    Define an instance and input the demographical and time data of the population
    Then use self.predict to generate plots and comparison

    To define an instance of PostProcess, you need the following inputs:

    Parameters:
    -----------

    demo_data : pandas.DataFrame
        The geographical data of the population
    time_data : pandas.DataFrame
        The infection data of the population at different time points

    '''

    def __init__(self, demo_data: pd.DataFrame, time_data: pd.DataFrame):
        '''
        Keep both input tables and create the nested sampling helper.

        The helper is exposed as ``self.predict`` and receives the same two
        tables that were given to this constructor.
        '''
        # Build the helper first from the raw arguments, then store everything
        helper = self.Prediction(demo_data=demo_data, time_data=time_data)
        self.demo_data, self.time_data = demo_data, time_data
        self.predict = helper


[docs]
    class Prediction():
        '''
        This sub-class is to automatically sample the population at several given time points.

        This sub-class is automatically defined when an instance of PostProcess is defined.

        To use this class, call methods defined under this class to sample and generate plots.

        Parameters:
        -----------

        demo_data : pandas.DataFrame
            The geographical data of the population
        time_data : pandas.DataFrame
            The infection data of the population at different time points

        '''

        def __init__(self, demo_data: pd.DataFrame, time_data: pd.DataFrame):
            '''
            Store the two tables handed over by PostProcess on this helper.

            PostProcess.__init__ calls this constructor, so it normally does not
            need to be invoked by hand.
            '''
            self.demo_data, self.time_data = demo_data, time_data


[docs]
        def AgeRegion(self, sample_size, time_sample, non_responder=False,
                      comparison=True, non_resp_rate=None, sample_strategy='Random',
                      gen_plot: bool = False, saving_path_sampling=None, num_age_group=17,
                      age_group_width=5, data_store_path='./input/', sampling_percentage=0.1,
                      proportion=0.01, threshold=None, seed=None, saving_path_compare=None,
                      scale_method='proportional'):
            '''
            Sample the population with age-and-region stratification and estimate
            the infection level at each requested time point.

            This method only collects its arguments and forwards them to
            ``_wrapper_Region_AgeRegion`` with ``sampling_method='AgeRegion'``.

            Parameters:
            -----------

            sample_size : int
                The size of sample
            time_sample : list
                A list of time points to sample the population
            non_responder : bool
                Turn on or off the non-responder function

                Default = False
            non_resp_rate : float between 0 and 1
                The probability that a person does not respond

                Default = None
            comparison : bool
                Turn on or off the comparison between the sampled result and the true result

                Default = True
            sample_strategy : str
                Whether the sampled people change between sampling times

                Recognised values: 'Random', 'Same'

                Default = 'Random'
            gen_plot : bool
                Whether or not to generate plots

                Default = False
            saving_path_sampling : str
                The path to save figure showing predicted infection level

                Default = None
            saving_path_compare : str
                The path to save figure showing comparison between predicted
                and true infection level

                Default = None
            num_age_group : int
                Number of age groups; the last group collects every age at or
                above its lower bound

                Default = 17
            age_group_width : int
                Width of each age group (except for the last group)

                Default = 5
            scale_method : str
                How the sampled data is scaled up for comparison with the true population

                Default = 'proportional'
            sampling_percentage : float, between 0 and 1
                The proportion of additional samples taken from a specific (age-)regional group

                Default = 0.1 (Only for non-responders)
            proportion : float, between 0 and 1
                The proportion of total groups to be sampled additionally

                Default = 0.01 (Only for non-responders)
            threshold : NoneType or Int
                The lowest number of groups to be sampled additionally

                Default = None (Only for non-responders)
            data_store_path : str
                The path to store data generated during sampling

                Default = ./input/
            seed : int or None
                The seed for random numbers

                Default = None

            Returns:
            --------

            tuple
                The pair returned by ``_wrapper_Region_AgeRegion``: the sampling
                result followed by the comparison result (None when
                ``comparison`` is False)

            '''
            # Gather every option once, then hand the whole set to the shared worker
            settings = {
                'sampling_method': 'AgeRegion',
                'sample_size': sample_size,
                'time_sample': time_sample,
                'non_responder': non_responder,
                'comparison': comparison,
                'non_resp_rate': non_resp_rate,
                'sample_strategy': sample_strategy,
                'gen_plot': gen_plot,
                'saving_path_sampling': saving_path_sampling,
                'num_age_group': num_age_group,
                'age_group_width': age_group_width,
                'data_store_path': data_store_path,
                'sampling_percentage': sampling_percentage,
                'proportion': proportion,
                'threshold': threshold,
                'seed': seed,
                'saving_path_compare': saving_path_compare,
                'scale_method': scale_method,
            }
            estimate, difference = self._wrapper_Region_AgeRegion(**settings)
            return estimate, difference



[docs]
        def Region(self, sample_size, time_sample, non_responder=False,
                   comparison=True, non_resp_rate=None, sample_strategy='Random',
                   gen_plot: bool = False, saving_path_sampling=None,
                   data_store_path='./input/', sampling_percentage=0.1,
                   proportion=0.01, threshold=None, seed=None, saving_path_compare=None,
                   scale_method='proportional'):
            '''
            Sample the population with region stratification and estimate the
            infection level at each requested time point.

            This method only collects its arguments and forwards them to
            ``_wrapper_Region_AgeRegion`` with ``sampling_method='Region'``.

            Parameters:
            -----------

            sample_size : int
                The size of sample
            time_sample : list
                A list of time points to sample the population
            non_responder : bool
                Turn on or off the non-responder function

                Default = False
            non_resp_rate : float between 0 and 1
                The probability that a person does not respond

                Default = None
            comparison : bool
                Turn on or off the comparison between the sampled result and the true result

                Default = True
            sample_strategy : str
                Whether the sampled people change between sampling times

                Recognised values: 'Random', 'Same'

                Default = 'Random'
            gen_plot : bool
                Whether or not to generate plots

                Default = False
            saving_path_sampling : str
                The path to save figure showing predicted infection level

                Default = None
            saving_path_compare : str
                The path to save figure showing comparison between predicted
                and true infection level

                Default = None
            scale_method : str
                How the sampled data is scaled up for comparison with the true population

                Default = 'proportional'
            sampling_percentage : float, between 0 and 1
                The proportion of additional samples taken from a specific regional group

                Default = 0.1 (Only for non-responders)
            proportion : float, between 0 and 1
                The proportion of total groups to be sampled additionally

                Default = 0.01 (Only for non-responders)
            threshold : NoneType or Int
                The lowest number of groups to be sampled additionally

                Default = None (Only for non-responders)
            data_store_path : str
                The path to store data generated during sampling

                Default = ./input/
            seed : int or None
                The seed for random numbers

                Default = None

            Returns:
            --------

            tuple
                The pair returned by ``_wrapper_Region_AgeRegion``: the sampling
                result followed by the comparison result (None when
                ``comparison`` is False)

            '''
            # Age-group options are deliberately not forwarded; the shared worker
            # keeps its own defaults for them
            settings = {
                'sampling_method': 'Region',
                'sample_size': sample_size,
                'time_sample': time_sample,
                'non_responder': non_responder,
                'comparison': comparison,
                'non_resp_rate': non_resp_rate,
                'sample_strategy': sample_strategy,
                'gen_plot': gen_plot,
                'saving_path_sampling': saving_path_sampling,
                'data_store_path': data_store_path,
                'sampling_percentage': sampling_percentage,
                'proportion': proportion,
                'threshold': threshold,
                'seed': seed,
                'saving_path_compare': saving_path_compare,
                'scale_method': scale_method,
            }
            estimate, difference = self._wrapper_Region_AgeRegion(**settings)
            return estimate, difference



[docs]
        def Age(self, sample_size, time_sample,
                comparison=True, non_resp_rate=None, sample_strategy='Random',
                gen_plot: bool = False, saving_path_sampling=None, num_age_group=17,
                age_group_width=5, data_store_path='./input/',
                seed=None, saving_path_compare=None,
                scale_method='proportional'):
            '''
            Sample the population with age stratification and estimate the
            infection level at each requested time point.

            This method only collects its arguments and forwards them to
            ``_wrapper_Age_Base`` with ``sampling_method='Age'``.

            Parameters:
            -----------

            sample_size : int
                The size of sample
            time_sample : list
                A list of time points to sample the population
            comparison : bool
                Turn on or off the comparison between the sampled result and the true result

                Default = True
            non_resp_rate : float or None
                Forwarded unchanged to ``_wrapper_Age_Base``

                Default = None
            sample_strategy : str
                Whether the sampled people change between sampling times

                Recognised values: 'Random', 'Same'

                Default = 'Random'
            gen_plot : bool
                Whether or not to generate plots

                Default = False
            saving_path_sampling : str
                The path to save figure showing predicted infection level

                Default = None
            saving_path_compare : str
                The path to save figure showing comparison between predicted
                and true infection level

                Default = None
            num_age_group : int
                Number of age groups; the last group collects every age at or
                above its lower bound

                Default = 17
            age_group_width : int
                Width of each age group (except for the last group)

                Default = 5
            scale_method : str
                How the sampled data is scaled up for comparison with the true population

                Default = 'proportional'
            data_store_path : str
                The path to store data generated during sampling

                Default = ./input/
            seed : int or None
                The seed for random numbers

                Default = None

            Returns:
            --------

            tuple
                The two values returned by ``_wrapper_Age_Base``, passed
                through unchanged

            '''
            # Gather every option once, then hand the whole set to the shared worker
            settings = {
                'sampling_method': 'Age',
                'sample_size': sample_size,
                'time_sample': time_sample,
                'comparison': comparison,
                'non_resp_rate': non_resp_rate,
                'sample_strategy': sample_strategy,
                'gen_plot': gen_plot,
                'saving_path_sampling': saving_path_sampling,
                'num_age_group': num_age_group,
                'age_group_width': age_group_width,
                'data_store_path': data_store_path,
                'seed': seed,
                'saving_path_compare': saving_path_compare,
                'scale_method': scale_method,
            }
            estimate, difference = self._wrapper_Age_Base(**settings)
            return estimate, difference



[docs]
        def Base(self, sample_size, time_sample,
                 comparison=True, non_resp_rate=None, sample_strategy='Random',
                 gen_plot: bool = False, saving_path_sampling=None, num_age_group=17,
                 age_group_width=5, data_store_path='./input/',
                 seed=None, saving_path_compare=None,
                 scale_method='proportional'):
            '''
            Sample the population with the base sampler (no age or region
            argument is used to pick the sampler) and estimate the infection
            level at each requested time point.

            This method only collects its arguments and forwards them to
            ``_wrapper_Age_Base`` with ``sampling_method='Base'``.

            Parameters:
            -----------

            sample_size : int
                The size of sample
            time_sample : list
                A list of time points to sample the population
            comparison : bool
                Turn on or off the comparison between the sampled result and the true result

                Default = True
            non_resp_rate : float or None
                Forwarded unchanged to ``_wrapper_Age_Base``

                Default = None
            sample_strategy : str
                Whether the sampled people change between sampling times

                Recognised values: 'Random', 'Same'

                Default = 'Random'
            gen_plot : bool
                Whether or not to generate plots

                Default = False
            saving_path_sampling : str
                The path to save figure showing predicted infection level

                Default = None
            saving_path_compare : str
                The path to save figure showing comparison between predicted
                and true infection level

                Default = None
            num_age_group : int
                Forwarded unchanged to ``_wrapper_Age_Base``

                Default = 17
            age_group_width : int
                Forwarded unchanged to ``_wrapper_Age_Base``

                Default = 5
            scale_method : str
                How the sampled data is scaled up for comparison with the true population

                Default = 'proportional'
            data_store_path : str
                The path to store data generated during sampling

                Default = ./input/
            seed : int or None
                The seed for random numbers

                Default = None

            Returns:
            --------

            tuple
                The two values returned by ``_wrapper_Age_Base``, passed
                through unchanged

            '''
            # Gather every option once, then hand the whole set to the shared worker
            settings = {
                'sampling_method': 'Base',
                'sample_size': sample_size,
                'time_sample': time_sample,
                'comparison': comparison,
                'non_resp_rate': non_resp_rate,
                'sample_strategy': sample_strategy,
                'gen_plot': gen_plot,
                'saving_path_sampling': saving_path_sampling,
                'num_age_group': num_age_group,
                'age_group_width': age_group_width,
                'data_store_path': data_store_path,
                'seed': seed,
                'saving_path_compare': saving_path_compare,
                'scale_method': scale_method,
            }
            estimate, difference = self._wrapper_Age_Base(**settings)
            return estimate, difference


        def _compare(self, time_sample, gen_plot=False, scale_method: str = 'proportional', saving_path_compare=None):
            '''
            Compare the estimated infection level with the true one.

            This method should not be used directly, it is integrated within methods AgeRegion, Age,
            Region and Base. It reads ``self.result``, which must already hold one
            estimated infection rate per entry of ``time_sample``.

            Parameters:
            -----------

            time_sample : list
                Row positions of ``self.time_data`` at which to compare
            gen_plot : bool
                Whether to draw the predicted, true and absolute-difference curves

                Default = False
            scale_method : str
                How the sampled rate is scaled up to a head count; only
                'proportional' (rate times population size) is supported

                Default = 'proportional'
            saving_path_compare : str
                Where to save the figure; only used when ``gen_plot`` is True

                Default = None

            Returns:
            --------

            numpy.ndarray
                True infected count minus scaled estimate, one value per time point

            Raises:
            -------

            ValueError
                If ``scale_method`` is not a supported value

            '''
            # Fail with a clear message instead of hitting an unbound variable later
            if scale_method != 'proportional':
                raise ValueError("Unsupported scale_method: only 'proportional' is available")
            result_scaled = np.array(self.result) * len(self.demo_data)

            # Status codes 3 to 8 are the ones counted towards the true result;
            # the first column of time_data is skipped (presumably the time stamp)
            counted_codes = [3, 4, 5, 6, 7, 8]
            true_result = [
                int(self.time_data.iloc[t, 1:].isin(counted_codes).sum())
                for t in time_sample
            ]

            # Find the difference between estimated infection level and the real one
            diff = np.array(true_result) - result_scaled
            if gen_plot:
                # NOTE: this draws on the current pyplot figure; no new figure is opened
                plt.plot(time_sample, result_scaled, label='Predicted result', linestyle='--')
                plt.plot(time_sample, true_result, label='True result')
                plt.plot(time_sample, np.abs(diff), label='Absolute difference')
                plt.legend()
                plt.xlabel('Time')
                plt.ylabel('Population')
                plt.title('Number of infection in the population')
                if saving_path_compare:
                    plt.savefig(saving_path_compare)
            return diff

        def _wrapper_Region_AgeRegion(self, sampling_method, sample_size, time_sample, non_responder=False,
                                      comparison=True, non_resp_rate=None, sample_strategy='Random',
                                      gen_plot: bool = False, saving_path_sampling=None, num_age_group=17,
                                      age_group_width=5, data_store_path='./input/', sampling_percentage=0.1,
                                      proportion=0.01, threshold=None, seed=None, saving_path_compare=None,
                                      scale_method='proportional'):
            '''
            This is the function really doing work.

            The reason why this wrapper function is set up is to reduce repeated code.

            The Region and AgeRegion methods share very similar code structures.

            Parameters:
            -----------

            sampling_method : str
                'AgeRegion' selects SamplerAgeRegion; any other value selects SamplerRegion
            sample_size : int
                The size of sample
            time_sample : list
                A list of time points to sample the population
            non_responder : bool
                When True, people are re-sampled at every time point, non-responders
                are simulated with ``non_resp_rate`` and ``sample_strategy`` is ignored
            non_resp_rate : float between 0 and 1
                Required when ``non_responder`` is True
            sample_strategy : str
                'Same' samples once and follows the same people; 'Random' samples
                again at every time point. Only read when ``non_responder`` is False
            comparison : bool
                When True, ``_compare`` is called and its result returned as ``diff``
            gen_plot, saving_path_sampling, saving_path_compare
                Plot switches and optional figure paths
            num_age_group, age_group_width
                Age grouping, used only when ``sampling_method`` is 'AgeRegion'
            sampling_percentage, proportion, threshold
                Passed to the sampler's ``additional_nonresponder`` (non-responders only)
            data_store_path : str
                Passed to the sampler constructor
            seed : int or None
                Seeds ``numpy.random`` when not None
            scale_method : str
                Passed to ``_compare``

            Returns:
            --------

            tuple
                ``(res, diff)``: ``res`` is ``[time_sample, infected_rate]`` and
                ``diff`` is the output of ``_compare``, or None when
                ``comparison`` is False. ``infected_rate`` is also stored in
                ``self.result``.

            Raises:
            -------

            ValueError
                If ``non_responder`` is True without ``non_resp_rate``, or if
                ``non_responder`` is False and ``sample_strategy`` is neither
                'Same' nor 'Random'

            '''
            if seed is not None:
                np.random.seed(seed)

            if non_responder:
                if non_resp_rate is None:
                    raise ValueError('You have to input the non-response rate when considering non-responders')
            elif sample_strategy not in ('Same', 'Random'):
                # Without this check no rate would ever be computed and the code
                # below would fail on an unbound variable
                raise ValueError("sample_strategy must be either 'Same' or 'Random'")

            def build_sampler():
                # A fresh sampler is created for every draw
                if sampling_method == 'AgeRegion':
                    return SamplerAgeRegion(data=self.demo_data, data_store_path=data_store_path,
                                            num_age_group=num_age_group,
                                            age_group_width=age_group_width)
                return SamplerRegion(data=self.demo_data, data_store_path=data_store_path)

            infected_rate = []
            if non_responder:  # For non-responders enabled
                # Only the Random sample strategy: sample again at each time point
                additional_sample = None  # set at the end of every iteration
                for i, time_point in enumerate(time_sample):
                    sampler_class = build_sampler()
                    if i == 0:
                        # Nothing is known about non-responders before the first draw
                        people = sampler_class.sample(sample_size=sample_size)
                    else:
                        people = sampler_class.sample(sample_size=sample_size,
                                                      additional_sample=additional_sample)

                    # Get the results of people sampled
                    X = SamplingMaker(non_resp_rate=non_resp_rate, keeptrack=True, TheData=self.time_data,
                                      false_positive=0, false_negative=0, threshold=None)
                    ite = X([time_point], people)

                    extra = None if i == 0 else np.array(additional_sample)
                    if extra is None or extra.sum() == 0:
                        # No additional samples were requested for this draw:
                        # the rate is simply positives over responders
                        outcome_counts = ite.iloc[0].value_counts()
                        positives = outcome_counts.get('Positive', 0)
                        responders = positives + outcome_counts.get('Negative', 0)
                        # Everybody may have failed to respond, then output nan
                        infected_rate.append(positives / responders if responders != 0 else np.nan)
                    else:
                        # Some groups received additional samples, so the draw is no
                        # longer stratified; re-weight those groups back to their
                        # intended share before computing the rate

                        # Record the position of groups that need additional samples
                        if sampling_method == 'AgeRegion':
                            indices = np.nonzero(extra)
                            boosted = {(int(r), int(a)) for r, a in zip(indices[0], indices[1])}
                            # One lookup table per time point instead of filtering
                            # demo_data once per person (first match wins, as before)
                            age_by_id = self.demo_data.drop_duplicates(subset='id').set_index('id')['age']
                        else:
                            boosted = {int(r) for r in np.nonzero(extra)[0]}

                        count_total = 0  # people from groups with additional samples
                        count_posi = 0  # positives from groups with additional samples
                        other_posi = 0  # positives from groups without additional samples
                        count_nonResp = 0  # non-responders from groups with additional samples
                        other_nonResp = 0  # non-responders from groups without additional samples
                        for person_id in people:
                            # The part of the id before the first '.' is used as the region position
                            region_pos = int(person_id.split('.')[0])
                            if sampling_method == 'AgeRegion':
                                age_value = age_by_id.at[person_id]
                                age_pos = min(num_age_group - 1, math.floor(age_value / age_group_width))
                                group = (region_pos, age_pos)
                            else:
                                group = region_pos
                            status = ite.iloc[0, ite.columns.get_loc(person_id)]
                            if group in boosted:
                                count_total += 1
                                if status == 'Positive':
                                    count_posi += 1
                                if status == 'NonResponder':
                                    count_nonResp += 1
                            else:
                                if status == 'Positive':
                                    other_posi += 1
                                if status == 'NonResponder':
                                    other_nonResp += 1

                        # Responders from groups with additional samples
                        effective_total = count_total - count_nonResp
                        untouched_total = len(people) - count_total

                        if effective_total > 0:
                            # Rescale the boosted groups to the places they would
                            # occupy in a sample of exactly sample_size people
                            spaces = sample_size - untouched_total
                            spaces_posi = round(spaces * count_posi / effective_total)
                            numerator = spaces_posi + other_posi
                            denominator = spaces + untouched_total - other_nonResp
                        else:
                            # Nobody in the boosted groups responded: fall back to
                            # the other groups only
                            numerator = other_posi
                            denominator = untouched_total - other_nonResp
                        infected_rate.append(numerator / denominator if denominator != 0 else np.nan)

                    # After each sample, generate the additional samples for the next
                    # draw from the ids that did not respond this time
                    non_resp_id = [column for column, status in zip(ite.columns, ite.iloc[0])
                                   if status == 'NonResponder']
                    additional_sample = sampler_class.additional_nonresponder(non_resp_id=non_resp_id,
                                                                              sampling_percentage=sampling_percentage,
                                                                              proportion=proportion,
                                                                              threshold=threshold)
            elif sample_strategy == 'Same':  # Do not change people sampled at each sample time point
                sampler_class = build_sampler()
                people = sampler_class.sample(sample_size=sample_size)

                # Get results of each people sampled
                X = SamplingMaker(non_resp_rate=0, keeptrack=True, TheData=self.time_data,
                                  false_positive=0, false_negative=0, threshold=None)
                ite = X(time_sample, people)

                # Output the infected rate, one row of ite per time point
                for i in range(len(time_sample)):
                    infected_rate.append(ite.iloc[i].value_counts().get('Positive', 0) / len(people))
            else:  # 'Random': change people sampled at each sample time point
                for time_point in time_sample:
                    sampler_class = build_sampler()
                    people = sampler_class.sample(sample_size=sample_size)

                    # Get the results of each people sampled
                    X = SamplingMaker(non_resp_rate=0, keeptrack=True, TheData=self.time_data,
                                      false_positive=0, false_negative=0, threshold=None)
                    ite = X([time_point], people)

                    # Output the infected rate
                    infected_rate.append(ite.iloc[0].value_counts().get('Positive', 0) / len(people))

            # Plot the figure
            if gen_plot:
                plt.plot(time_sample, infected_rate)
                plt.xlabel('Time')
                plt.ylabel('Population')
                plt.xlim(0, max(time_sample))
                # NOTE(review): the plotted values are rates, yet the y axis is
                # limited to the population size - confirm this is intended
                plt.ylim(0, len(self.demo_data))
                plt.title('Number of infection in the sample')
                if saving_path_sampling:
                    plt.savefig(saving_path_sampling)
            res = [time_sample, infected_rate]
            # Output the results for comparison use
            self.result = infected_rate

            if comparison:
                diff = self._compare(time_sample=time_sample, gen_plot=gen_plot, scale_method=scale_method,
                                     saving_path_compare=saving_path_compare)
                return res, diff
            return res, None

        def _wrapper_Age_Base(self, sampling_method, sample_size, time_sample,
                              comparison=True, non_resp_rate=None, sample_strategy='Random',
                              gen_plot: bool = False, saving_path_sampling=None, num_age_group=17,
                              age_group_width=5, data_store_path='./input/',
                              seed=None, saving_path_compare=None,
                              scale_method='proportional'):
            '''
            This is the function really doing work.

            The reason why this wrapper function is set up is to reduce repeated code.

            The Age and Base methods share very similar code structures.
            '''
            if seed is not None:
                np.random.seed(seed)

            if sample_strategy == 'Same':  # Do not change people sampled at each sample time point
                infected_rate = []

                # Do the sampling
                if sampling_method == 'Age':
                    sampler_class = SamplerAge(data=self.demo_data, data_store_path=data_store_path,
                                               num_age_group=num_age_group, age_group_width=age_group_width)
                else:
                    sampler_class = Sampler(data=self.demo_data, data_store_path=data_store_path)
                people = sampler_class.sample(sample_size=sample_size)

                # Get results of each people sampled
                X = SamplingMaker(non_resp_rate=0, keeptrack=True, TheData=self.time_data,
                                  false_positive=0, false_negative=0, threshold=None)
                ite = X(time_sample, people)

                # Output the infected rate
                for i in range(len(time_sample)):
                    infected_rate.append(ite.iloc[i].value_counts().get('Positive', 0) / len(people))
            elif sample_strategy == 'Random':  # Change people sampled at each sample time point
                infected_rate = []
                for i in range(len(time_sample)):  # Sample at each sample time points
                    if i == 0:  # First time sampling, need pre_process
                        if sampling_method == 'Age':
                            sampler_class = SamplerAge(data=self.demo_data, data_store_path=data_store_path,
                                                       num_age_group=num_age_group, age_group_width=age_group_width)
                        else:
                            sampler_class = Sampler(data=self.demo_data, data_store_path=data_store_path)
                    else:  # After the data process, we can directly read files processed at the first time
                        if sampling_method == 'Age':
                            sampler_class = SamplerAge(data=self.demo_data, data_store_path=data_store_path,
                                                       num_age_group=num_age_group, age_group_width=age_group_width)
                        else:
                            sampler_class = Sampler(data=self.demo_data, data_store_path=data_store_path)
                    people = sampler_class.sample(sample_size=sample_size)

                    # Get the results of each people sampled
                    X = SamplingMaker(non_resp_rate=0, keeptrack=True, TheData=self.time_data,
                                      false_positive=0, false_negative=0, threshold=None)
                    ite = X([time_sample[i]], people)

                    # Output the infected rate
                    infected_rate.append(ite.iloc[0].value_counts().get('Positive', 0) / len(people))

            # Plot the figure
            if gen_plot:
                plt.plot(time_sample, infected_rate)
                plt.xlabel('Time')
                plt.ylabel('Population')
                plt.xlim(0, max(time_sample))
                plt.ylim(0, len(self.demo_data))
                plt.title('Number of infection in the sample')
                if saving_path_sampling:
                    plt.savefig(saving_path_sampling)
            res = []
            res.append(time_sample)
            res.append(infected_rate)
            # Output the results for comparison use
            self.result = infected_rate

            if comparison:
                diff = self._compare(time_sample=time_sample, gen_plot=gen_plot, scale_method=scale_method,
                                     saving_path_compare=saving_path_compare)
                return res, diff
            else:
                return res, None


    def __call__(self, sampling_method, sample_size, time_sample, non_responder=False, comparison=True,
                 non_resp_rate=None, data_store_path='./input/', **kwargs):
        '''
        This method need not to be called directly.
        This is mainly to be compatible with the following methods _iteration_once and best_method

        Parameters:
        -----------

        sampling_method : str
            A specific string tells which sampling method using

            Methods can be recognised: AgeRegion, Region, Age, Base
        sample_size : int
            The size of sample
        time_sample : list
            A list of time points to sample the population
        non_responder : bool
            Turn on or off the non-responder function
        comparison : bool
            Turn on or off the comparison between the sampled result and the true result
        kwargs : dict
            A dictionary of parameters passed to process part
            The following parameters can be passed:
                num_age_group : int
                    Indicating how many age groups are there.

                    *The last group includes age >= some threshold*

                    Default = 17
                age_group_width : int
                    Indicating the width of each age group(except for the last group)

                    Default = 5
                scale_method : str
                    Specific string telling how to compare the sampled data with the true population

                    Default = 'proportional'
                sampling_percentage : float, between 0 and 1
                    The proportion of additional samples taken from a specific (age-)regional group

                    Default = 0.1 (Only for non-responders)
                proportion : float, between 0 and 1
                    The proportion of total groups to be sampled additionally

                    Default = 0.01 (Only for non-responders)
                threshold : NoneType or Int
                    The lowest number of groups to be sampled additionally

                    Default = None (Only for non-responders)
                seed : int or None
                    The seed for random numbers

                    Default = None

        '''
        # Select all useful variable names provided in kwargs
        total_params = ['num_age_group', 'age_group_width', 'sampling_percentage', 'proportion',
                        'threshold', 'seed', 'scale_method']

        # Pass the parameters for sampling into the function
        sampling_input = {}
        for i in total_params:
            try:
                sampling_input[i] = kwargs[i]
            except KeyError:
                pass
        res, diff = self._sampled_result(sampling_method=sampling_method, sample_size=sample_size,
                                         time_sample=time_sample, non_resp_rate=non_resp_rate,
                                         data_store_path=data_store_path,
                                         comparison=comparison, non_responder=non_responder,
                                         **sampling_input)
        return res, diff

    def _sampled_result(self, sampling_method, sample_size, time_sample, sample_strategy='Random',
                        num_age_group=17, age_group_width=5, data_store_path='./input/', seed=None,
                        non_responder=False, comparison=True, non_resp_rate=None,
                        sampling_percentage=0.1, proportion=0.01, threshold=None,
                        scale_method='proportional'):
        '''
        This is a method to generate the sampled result and plot a figure
        This method should not be used directly, it is integrated within the __callable__ method

        '''
        if sampling_method == 'AgeRegion':  # For both age and region stratification
            res, diff = self.predict.AgeRegion(
                sample_size=sample_size,
                time_sample=time_sample,
                non_responder=non_responder,
                comparison=comparison,
                non_resp_rate=non_resp_rate,
                sample_strategy=sample_strategy,
                num_age_group=num_age_group,
                age_group_width=age_group_width,
                data_store_path=data_store_path,
                sampling_percentage=sampling_percentage,
                proportion=proportion,
                threshold=threshold,
                seed=seed,
                scale_method=scale_method
            )
        elif sampling_method == 'Age':  # For only age stratification
            res, diff = self.predict.Age(
                sample_size=sample_size,
                time_sample=time_sample,
                comparison=comparison,
                sample_strategy=sample_strategy,
                num_age_group=num_age_group,
                age_group_width=age_group_width,
                data_store_path=data_store_path,
                seed=seed,
                scale_method=scale_method
            )
        elif sampling_method == 'Region':  # Only region stratification
            res, diff = self.predict.Region(
                sample_size=sample_size,
                time_sample=time_sample,
                non_responder=non_responder,
                comparison=comparison,
                non_resp_rate=non_resp_rate,
                sample_strategy=sample_strategy,
                data_store_path=data_store_path,
                sampling_percentage=sampling_percentage,
                proportion=proportion,
                threshold=threshold,
                seed=seed,
                scale_method=scale_method
            )
        elif sampling_method == 'Base':  # Use the base sampling method, no age/regional stratification
            res, diff = self.predict.AgeRegion(
                sample_size=sample_size,
                time_sample=time_sample,
                comparison=comparison,
                sample_strategy=sample_strategy,
                data_store_path=data_store_path,
                seed=seed,
                scale_method=scale_method
            )

        return res, diff

    def _diff_processing(self, diff, metric):
        '''
        Function for transforming the diff into a value according to different metric

        Parameters:
        -----------

        diff : list
            The diff from _compare method
        metric : str
            A specific string specifying the method used to transform

        Output:
        -------

        A float number
        '''
        if metric == 'mean':
            return np.nanmean(np.abs(diff))
        elif metric == 'max':
            return max(np.abs(diff))

    def _iteration_once(
            self,
            sampling_interval,
            total_day_number,
            non_responder,
            hyperparameter_autotune,
            recognised_methods,
            sample_size,
            useful_inputs,
            metric,
            data_store_path=None,
            job_id=None,
            temp_folder_name=None,
            non_resp_rate=None,
            useful_inputs_nonrespRange=None
    ):
        '''
        The function to perform one iteration

        Parameters:
        -----------

        sampling_interval : int
            The number of days between two sample time points
        total_day_number : int
            The total number of days that simulated
        non_responder : bool
            Whether or not to consider non-responders
        hyperparameter_autotune : bool
            Whether or not to autotune the hyperparameters
        recognised_methods : list
            A list of sampling methods that is recognised by 'best_method' method
        sample_size : int
            The size of the sample
        useful_inputs : dict
            A dictionary including all parameters used for sampling
        metric : str
            A specific string indicating the metric used to transform diff to a single value
        job_id : int
            An ID of the current job when multiprocessing is on,
            when this value is None, it means the multiprocessing is off.
            When we turn on multiprocessing, a value will be passed to this parameter.

            Default = None
        temp_folder_name : str
            The name of the folder to store the files generated, it will be cleaned after.
            This is set to None by default, when we need multiprocessing, a value will be
            passed to this parameter

            Default = None
        non_resp_rate : float between 0 and 1
            The possibility of a person to be non-responders.
            When we consider non-responders, a value will be passed to this parameter.

            Default = None
        useful_inputs_nonrespRange : dict
            When hyperparameter tuning is on, and non-responder is on, the 'Region' method requires different input.
            This dictionary include these inputs. When we consider non-responders, a value will be passed
            to this parameter.

            Default = None

        Output:
        -------

        results : list of list
            A list of n lists, where n is the total number of recognised methods.
            Each list within 'results' contain the results of the same method under different sets of parameters.
            *The length of these lists are not the same since the number of combinations of parameters are different.*
        '''
        # Create a temperary folder to put temperary files under the path of __main__ files
        if job_id is not None:
            main_module_path = os.path.abspath(sys.modules['__main__'].__file__)
            dir_name = os.path.dirname(main_module_path)
            data_store_path = os.path.join(dir_name, temp_folder_name, 'job_id_' + str(job_id))
            os.mkdir(data_store_path)
        else:
            data_store_path = data_store_path

        # Firstly define the time points to sample based on sampling-interval
        time_sample = list(np.arange(math.floor(total_day_number / sampling_interval))
                           * sampling_interval)

        # Divide into different cases
        if non_responder is False:
            if hyperparameter_autotune is False:

                # This is the result to output in the end, performance of different methods
                res_across_methods = []
                for method in recognised_methods:

                    # Put the result of the same method into one list
                    result_within_method = []

                    # Split the method name and sample strategy
                    method_string = method.split('-')
                    if method_string[1] == 'Same':
                        input_kwargs = {
                            'sample_strategy': 'Same'
                        }

                        # Pour the inputs in useful_inputs into the dict to input
                        for input in useful_inputs:
                            input_kwargs[input] = useful_inputs[input]

                        # Perform the sampling by __call__ method above
                        _, diff = self(method_string[0], sample_size,
                                       time_sample, data_store_path=data_store_path,
                                       **input_kwargs)

                        # Process the diff according to the metric provided
                        result_within_method.append(self._diff_processing(diff, metric))
                    elif method_string[1] == 'Random':
                        # The following part is almost same as above
                        input_kwargs = {
                            'sample_strategy': 'Random'
                        }
                        for input in useful_inputs:
                            input_kwargs[input] = useful_inputs[input]
                        _, diff = self(method_string[0], sample_size,
                                       time_sample, data_store_path=data_store_path,
                                       **input_kwargs)
                        result_within_method.append(self._diff_processing(diff, metric))

                    # For different methods, we have a list to contain its result,
                    # I do this because there will be different parameter pairs to repeat
                    res_across_methods.append(result_within_method)

                # Output the final result and clean up
                if job_id is not None:
                    self._clean_up(temp_folder_name=temp_folder_name, data_store_path=data_store_path)
                return res_across_methods
            else:

                # This is the code when considering hyperparameter autotuning
                # Use the same structure as above
                # Comment when there is something is different
                res_across_methods = []
                for method in recognised_methods:
                    result_within_method = []
                    method_string = method.split('-')
                    input_kwargs = {
                        'sample_strategy': method_string[1]
                    }

                    # Here, we need to distuiguish between Age-related and
                    # Age-unrelated, since the inputs are different
                    # For age-related, they need the num_age_group and
                    # age_group_width variables
                    if method_string[0] == 'Base' or method_string[0] == 'Region':

                        # Since there is no parameters to vary,
                        # So just like above, directly output
                        # the result
                        _, diff = self(method_string[0], sample_size,
                                       time_sample, data_store_path=data_store_path,
                                       **input_kwargs)
                        result_within_method.append(self._diff_processing(diff, metric))
                    elif method_string[0] == 'Age' or method_string[0] == 'AgeRegion':

                        # Now we have parameters to vary
                        # Firstly we should collect all parameters can vary
                        # And put their ranges into a list
                        all_ranges = []
                        for key in useful_inputs:
                            all_ranges.append(useful_inputs[key])

                        # Use this list to generate all possible combinations
                        # of different parameters
                        all_combinations = list(product(*all_ranges))

                        # For each combination, do a sampling and output result
                        for combination in all_combinations:
                            count = 0
                            for key in useful_inputs:
                                input_kwargs[key[:-6]] = combination[count]
                                count += 1
                            _, diff = self(method_string[0], sample_size,
                                           time_sample, data_store_path=data_store_path,
                                           **input_kwargs)
                            result_within_method.append(self._diff_processing(diff, metric))
                    res_across_methods.append(result_within_method)
                if job_id is not None:
                    self._clean_up(temp_folder_name=temp_folder_name, data_store_path=data_store_path)
                return res_across_methods
        else:

            # The following part is for the case considering non-responders
            # Use the same logic as above
            # Comments when there is something different below
            if hyperparameter_autotune is False:
                res_across_methods = []
                for method in recognised_methods:
                    result_within_method = []
                    method_string = method.split('-')
                    input_kwargs = {}
                    for input in useful_inputs:
                        input_kwargs[input] = useful_inputs[input]
                    _, diff = self(method_string[0], sample_size,
                                   time_sample, non_responder=True,
                                   non_resp_rate=non_resp_rate, data_store_path=data_store_path,
                                   **input_kwargs)
                    result_within_method.append(self._diff_processing(diff, metric))
                    res_across_methods.append(result_within_method)
                if job_id is not None:
                    self._clean_up(temp_folder_name=temp_folder_name, data_store_path=data_store_path)
                return res_across_methods
            else:
                res_across_methods = []
                for method in recognised_methods:
                    result_within_method = []
                    method_string = method.split('-')
                    input_kwargs = {}
                    if method_string[0] == 'Region':

                        # Here has something different
                        # Since Region method does not have the num_age_group
                        # and age_group_width variable to vary,
                        # So we need to use a different useful_input dict
                        # to setup the ranges to generate the combinations
                        all_ranges = []
                        for key in useful_inputs_nonrespRange:
                            all_ranges.append(useful_inputs_nonrespRange[key])
                        all_combinations = list(product(*all_ranges))
                        for combination in all_combinations:
                            count = 0
                            for key in useful_inputs_nonrespRange:
                                input_kwargs[key[:-6]] = combination[count]
                                count += 1
                            _, diff = self(method_string[0], sample_size,
                                           time_sample, non_responder=True,
                                           non_resp_rate=non_resp_rate, data_store_path=data_store_path,
                                           **input_kwargs)
                            result_within_method.append(self._diff_processing(diff, metric))
                    elif method_string[0] == 'AgeRegion':

                        # Here is just the normal case
                        all_ranges = []
                        for key in useful_inputs:
                            all_ranges.append(useful_inputs[key])
                        all_combinations = list(product(*all_ranges))
                        for combination in all_combinations:
                            count = 0
                            for key in useful_inputs:
                                input_kwargs[key[:-6]] = combination[count]
                                count += 1
                            _, diff = self(method_string[0], sample_size,
                                           time_sample, non_responder=True,
                                           non_resp_rate=non_resp_rate, data_store_path=data_store_path,
                                           **input_kwargs)
                            result_within_method.append(self._diff_processing(diff, metric))
                    res_across_methods.append(result_within_method)
                if job_id is not None:
                    self._clean_up(temp_folder_name=temp_folder_name, data_store_path=data_store_path)
                return res_across_methods

    def _clean_up(self, temp_folder_name, data_store_path):
        '''
        This method is to clean up the temporary files generated during multiprocessing.
        This is called within the method '_iteration_once' when multiprocessing is on.

        Parameters:
        -----------

        temp_folder_name : str
            The name of the file to store the files
        data_store_path : str
            The exact path to store the data
        '''
        # This is to locate the path from __main__
        # This part can be unnecessary, but find the path from absolute path
        # is to avoid weird names in some path names
        main_module_path = os.path.abspath(sys.modules['__main__'].__file__)
        dir_name = os.path.dirname(main_module_path)

        # From here, removing the job_id_i folder and its content
        if os.path.exists(os.path.join(dir_name, temp_folder_name)):
            if os.path.exists(data_store_path):
                for file in ['pop_dist.json', 'microcells.csv', 'data.csv']:
                    if os.path.exists(data_store_path + file):
                        os.remove(data_store_path + file)
                os.rmdir(data_store_path)

    def _wrapper_iteration_once(self, kwargs_dict):
        '''
        Since the input variables of _iteration_once may be different, need this function to wrap up these inputs

        Parameters:
        -----------

        kwargs_dict : dict
            A dictionary of inputs of '_iteration_once'
        '''
        return self._iteration_once(**kwargs_dict)


[docs]
    def best_method(self, methods, sample_size, hyperparameter_autotune=False,
                    non_responder=False, non_resp_rate=None, sampling_interval=7,
                    parallel_computation=True, metric='mean', iteration=100,
                    data_store_path='./input/', **kwargs):
        '''
        Print the best method among different methods provided.

        When hyper-parameter autotune is on, will firstly print the best parameter
        combination and its performance of each method, then print the best method
        across all methods.

        The order of best parameter set printed follows the following ordering:
            (
            'num_age_group',
            'age_group_width',
            'sampling_percentage',
            'proportion',
            'threshold'
            )

            Parameter will be omitted if that parameter is not applicable for the
            method.

        Features:
        ---------

        When a range of parameters provided, can automatically tune the hyperparameters

        Can set to consider non-ressponders

        Will print any unrecognised inputs or methods

        Parameters:
        -----------

        methods : list
            A list of strings indicating the methods to compare with each other
            Acceptible methods:
                Use 'Same' strategy:
                    'Age-Same'
                    'Region-Same'
                    'AgeRegion-Same'
                    'Base-Same'
                Use 'Random' strategy:
                    'Age-Random'
                    'Region-Random'
                    'AgeRegion-Random'
                    'Base-Random'
                *Note: When you input the method names without sample strategy, 'Random' will be the default*
        sample_size : int
            The size of sample
        hyperparameter_autotune : bool
            Whether or not to turn on the hyperparameter automatic tuning

            *For extra input, see documentation for parameter 'kwargs' below*
        non_responder : bool
            Whether or not to consider non-responders
        sampling_interval : int
            The number of days between each sampling time points
        metric : str
            The metric used to transform difference between the sampled result and true infection into
            a float to measure the performance.
            Acceptible metric:
                'mean':
                    Use the mean of absolute difference between true and predicted infection.
                    We ignore all nan values
                'max':
                    Use the max of absolute difference between true and predicted infection.
        iteration : int
            The number of iterations to run and average the value of prediction to get
            a robust result
        parallel_computation : bool
            Whether or not to use multiprocessing to speed up this repeated process

            Default = True

            *Note: You cannot directly call this method when this is turned on, see example in documentation*
        data_store_path : str
            The path to store files generated during sampling when parallel computation is disabled

            Default = './input/'

            *This is used only when parallel computation is disabled*
        kwargs : dict
            A dictionary of parameters passed to process part
            The following parameters can be passed:
                num_age_group : int
                    Indicating how many age groups are there.

                    Default = 17

                    *The last group includes age >= some threshold*

                    *This is used when autotuning is turned off*
                age_group_width : int
                    Indicating the width of each age group(except for the last group)

                    Default = 5

                    *This is used when autotuning is turned off*
                sampling_percentage : float, between 0 and 1
                    The proportion of additional samples taken from a specific (age-)regional group

                    Default = 0.1 (Only for non-responders)

                    *This is used when autotuning is turned off*
                proportion : float, between 0 and 1
                    The proportion of total groups to be sampled additionally

                    Default = 0.01 (Only for non-responders)

                    *This is used when autotuning is turned off*
                threshold : NoneType or Int
                    The lowest number of groups to be sampled additionally

                    Default = None (Only for non-responders)

                    *This is used when autotuning is turned off*
                num_age_group_range : list
                    All possible number of age groups that you want to try/iterate over

                    Default = [10, 13, 15, 17, 20]

                    *The last group includes age >= some threshold*

                    *This is used when autotuning is turned on*
                age_group_width_range : list
                    All possible age group width (except for the last group)
                    that you want to try/iterate over

                    Default = [5, 10]

                    *This is used when autotuning is turned on*
                sampling_percentage_range : list
                    All possible proportion of additional samples taken
                    from a specific (age-)regional group that you want to
                    try/iterate over

                    Default = [0.1, 0.2, 0.3] (Only for non-responders)

                    *This is used when autotuning is turned on*
                proportion_range : list
                    All possible proportion of total groups to be sampled additionally
                    that you want to try/iterate over

                    Default = [0.01, 0.05, 0.1] (Only for non-responders)

                    *This is used when autotuning is turned on*
                threshold_range : list
                    All possible lowest number of groups to be sampled additionally
                    that you want to try/iterate over

                    Default = [10, 20, 30] (Only for non-responders)

                    *This is used when autotuning is turned on*
        '''
        # Check whether metric is recognisable
        recognisable_metric = [
            'mean',
            'max'
        ]
        if metric not in recognisable_metric:
            raise ValueError('Metric not recognisable')

        # All recognisable inputs
        recognisable_inputs = [
            'num_age_group',
            'age_group_width',
            'sampling_percentage',
            'proportion',
            'threshold',
            'num_age_group_range',
            'age_group_width_range',
            'sampling_percentage_range',
            'proportion_range',
            'threshold_range'
        ]

        # All parameters used when the autotune function enabled
        inputs_for_autotune_normal = {
            'num_age_group_range': [10, 13, 15, 17, 20],
            'age_group_width_range': [5, 10]
        }
        inputs_for_autotune_nonresp = {
            'num_age_group_range': [10, 13, 15, 17, 20],
            'age_group_width_range': [5, 10],
            'sampling_percentage_range': [0.1, 0.2, 0.3],
            'proportion_range': [0.01, 0.05, 0.1],
            'threshold_range': [10, 20, 30]
        }
        inputs_for_autotune_nonresp_Region = {
            'sampling_percentage_range': [0.1, 0.2, 0.3],
            'proportion_range': [0.01, 0.05, 0.1],
            'threshold_range': [10, 20, 30]
        }

        # All parameters used when the autotune function disabled
        inputs_for_disabled_autotune_normal = [
            'num_age_group',
            'age_group_width'
        ]
        inputs_for_disabled_autotune_nonresp = [
            'num_age_group',
            'age_group_width',
            'sampling_percentage',
            'proportion',
            'threshold'
        ]

        # Parameter Ordering when hyperparameter autotune is on
        inputs_ordering_normal = [
            'num_age_group_range',
            'age_group_width_range'
        ]
        inputs_ordering_nonrespRegion = [
            'sampling_percentage_range',
            'proportion_range',
            'threshold_range'
        ]
        inputs_ordering_nonrespAgeRegion = [
            'num_age_group_range',
            'age_group_width_range',
            'sampling_percentage_range',
            'proportion_range',
            'threshold_range'
        ]

        # First, separate the recognised inputs from the unrecognisable ones
        recognised_inputs = {}
        irrecognisable_input = []
        for input in kwargs:
            if input in recognisable_inputs:
                recognised_inputs[input] = kwargs[input]
            else:
                irrecognisable_input.append(input)

        # Then select the inputs that are useful for the enabled functionality
        useful_inputs = {}
        if hyperparameter_autotune:
            if non_responder is False:

                # Put all specified recognised inputs into useful inputs
                # Discard the others
                for input in recognised_inputs:
                    if input in inputs_for_autotune_normal:
                        useful_inputs[input] = recognised_inputs[input]
                    else:
                        irrecognisable_input.append(input)

                # When some of the inputs are not specified, use default values stored
                for input in inputs_for_autotune_normal:
                    if input in useful_inputs:
                        pass
                    else:
                        useful_inputs[input] = inputs_for_autotune_normal[input]
            else:

                # Since 'Region' does not need to iterate over the age-related
                # parameters, define a separate dict to contain its variables
                useful_inputs_nonrespRange = {}

                # Put all specified recognised inputs into useful inputs
                # Discard the others
                for input in recognised_inputs:
                    if input in inputs_for_autotune_nonresp:
                        useful_inputs[input] = recognised_inputs[input]
                    else:
                        irrecognisable_input.append(input)

                    # Do the same for Region
                    if input in inputs_for_autotune_nonresp_Region:
                        useful_inputs_nonrespRange[input] = recognised_inputs[input]

                # When some of the inputs are not specified, use default values stored
                for input in inputs_for_autotune_nonresp:
                    if input in useful_inputs:
                        pass
                    else:
                        useful_inputs[input] = inputs_for_autotune_nonresp[input]

                # Do the same for Region
                for input in inputs_for_autotune_nonresp_Region:
                    if input in useful_inputs_nonrespRange:
                        pass
                    else:
                        useful_inputs_nonrespRange[input] = inputs_for_autotune_nonresp_Region[input]
        else:
            if non_responder:  # Since non-responders will need more inputs, divide into two situations
                for input in recognised_inputs:
                    if input in inputs_for_disabled_autotune_nonresp:
                        useful_inputs[input] = recognised_inputs[input]
                    else:
                        irrecognisable_input.append(input)
            else:
                for input in recognised_inputs:
                    if input in inputs_for_disabled_autotune_normal:
                        useful_inputs[input] = recognised_inputs[input]
                    else:
                        irrecognisable_input.append(input)

        # Print the unused inputs
        if irrecognisable_input:
            print_str = 'The following inputs provided are not used: '
            print_str += ', '.join(irrecognisable_input)
            print(print_str)

        total_day_number = len(self.time_data)

        # For methods, need to distinguish recognised ones
        recognisable_methods = [
            'AgeRegion',
            'Age',
            'Region',
            'Base',
            'AgeRegion-Same',
            'Age-Same',
            'Region-Same',
            'Base-Same',
            'AgeRegion-Random',
            'Age-Random',
            'Region-Random',
            'Base-Random'
        ]
        recognised_methods = set()
        irrecognisable = []
        for method in methods:
            if method in recognisable_methods:
                if len(method.split('-')) == 1:
                    recognised_methods.add(method + '-Random')
                else:
                    recognised_methods.add(method)
            else:
                irrecognisable.append(method)

        # If the non-responder function is enabled, all 'Same' methods are ignored,
        # and so are the 'Age' and 'Base' methods
        if non_responder:
            for method in recognised_methods.copy():
                if method[-4:] == 'Same':
                    recognised_methods.remove(method)
                    irrecognisable.append(method)
                elif method[:-7] == 'Age' or method[:-7] == 'Base':
                    recognised_methods.remove(method)
                    irrecognisable.append(method)

        # Print all irrecognised methods
        if irrecognisable:
            print_str = 'The following methods provided are not used: '
            for i in irrecognisable:
                print_str += i
                print_str += ', '
            print_str = print_str[:-2]
            print(print_str)

        recognised_methods = list(recognised_methods)
        if len(recognised_methods) == 0:
            raise ValueError('No valid method detected')

        # Prepare the inputs for each iteration
        iteration_inputs = {
            'sampling_interval': sampling_interval,
            'total_day_number': total_day_number,
            'non_responder': non_responder,
            'hyperparameter_autotune': hyperparameter_autotune,
            'recognised_methods': recognised_methods,
            'sample_size': sample_size,
            'useful_inputs': useful_inputs,
            'metric': metric
        }
        if non_responder:
            iteration_inputs['non_resp_rate'] = non_resp_rate
            if hyperparameter_autotune:
                if 'Region-Random' in recognised_methods:
                    iteration_inputs['useful_inputs_nonrespRange'] = useful_inputs_nonrespRange

        # Prepare the folder name to store the temporary data
        if parallel_computation:
            temp_folder_name = 'temp_'
            main_module_path = os.path.abspath(sys.modules['__main__'].__file__)
            dir_name = os.path.dirname(main_module_path)
            while os.path.exists(os.path.join(dir_name, temp_folder_name)):
                temp_folder_name += 'a'
            iteration_inputs['temp_folder_name'] = temp_folder_name
            os.mkdir(os.path.join(dir_name, temp_folder_name))

            # From here, enable multiprocessing
            # Firstly, prepare the input
            multiprocessing_inputs = []
            for i in range(iteration):
                multiprocessing_input = iteration_inputs.copy()
                multiprocessing_input['job_id'] = i
                multiprocessing_inputs.append(multiprocessing_input)
            num_processes = multiprocessing.cpu_count()
            with multiprocessing.Pool(processes=num_processes) as pool:
                # Map the single-iteration wrapper over the prepared inputs
                results = pool.map(self._wrapper_iteration_once, multiprocessing_inputs)

            if os.path.exists(os.path.join(dir_name, temp_folder_name)):
                os.rmdir(os.path.join(dir_name, temp_folder_name))
        else:
            # Here is when the multiprocessing is disabled
            iteration_inputs['data_store_path'] = data_store_path
            results = []
            for i in range(iteration):
                results.append(self._iteration_once(**iteration_inputs))

        # Average the result over all iterations
        res = []
        for j in range(len(results[0])):
            res.append([])
            for i in range(len(results[0][j])):
                list_ite = []
                for k in range(iteration):
                    list_ite.append(results[k][j][i])
                res[j].append(np.nanmean(list_ite))

        # This last block is to print out the result
        if hyperparameter_autotune is False:
            # When autotune is off, each row has only one element,
            # so we can directly find the minimum and print it
            output = []
            for i in res:
                output += i
            min_index = output.index(min(output))
            print('The best method is %s, with %s difference %s' % (recognised_methods[min_index],
                                                                    metric, output[min_index]))
        else:
            # When autotune is on, each row has many elements, so we
            # first need to find the minimum in each row (i.e. for each method)
            output = {}
            for i in range(len(res)):
                min_index = res[i].index(min(res[i]))
                output[i] = res[i][min_index]
                if non_responder is False:
                    method = recognised_methods[i]
                    method_string = method.split('-')

                    # Print out the best combination parameters for different methods
                    if method_string[0] == 'Base' or method_string[0] == 'Region':
                        print('%s method has %s difference %s' % (method, metric, res[i][min_index]))
                    elif method_string[0] == 'Age' or method_string[0] == 'AgeRegion':
                        all_ranges = []
                        ordering = []
                        for key in useful_inputs:
                            all_ranges.append(useful_inputs[key])
                            ordering.append(inputs_ordering_normal.index(key))
                        all_combinations = list(product(*all_ranges))
                        best_parameter_value = all_combinations[min_index]
                        ordered_best_parameter_value = []
                        for position in range(len(best_parameter_value)):
                            reorder_position = ordering.index(position)
                            ordered_best_parameter_value.append(best_parameter_value[reorder_position])
                        print('The best %s method achieved when parameter is %s, with %s difference %s'
                              % (method, tuple(ordered_best_parameter_value), metric, res[i][min_index]))

                else:
                    method = recognised_methods[i].split('-')[0]

                    # Print out the best combination parameters for different methods
                    if method == 'Region':
                        all_ranges = []
                        ordering = []
                        for key in useful_inputs_nonrespRange:
                            all_ranges.append(useful_inputs_nonrespRange[key])
                            ordering.append(inputs_ordering_nonrespRegion.index(key))
                        all_combinations = list(product(*all_ranges))
                        best_parameter_value = all_combinations[min_index]
                        ordered_best_parameter_value = []
                        for position in range(len(best_parameter_value)):
                            reorder_position = ordering.index(position)
                            ordered_best_parameter_value.append(best_parameter_value[reorder_position])
                        print('The best %s method achieved when parameter is %s, with %s difference %s'
                              % (method, tuple(ordered_best_parameter_value), metric, res[i][min_index]))
                    elif method == 'AgeRegion':
                        all_ranges = []
                        ordering = []
                        for key in useful_inputs:
                            all_ranges.append(useful_inputs[key])
                            ordering.append(inputs_ordering_nonrespAgeRegion.index(key))
                        all_combinations = list(product(*all_ranges))
                        best_parameter_value = all_combinations[min_index]
                        ordered_best_parameter_value = []
                        for position in range(len(best_parameter_value)):
                            reorder_position = ordering.index(position)
                            ordered_best_parameter_value.append(best_parameter_value[reorder_position])
                        print('The best %s method achieved when parameter is %s, with %s difference %s'
                              % (method, tuple(ordered_best_parameter_value), metric, res[i][min_index]))

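            # Find the best method among all methods, and print it out
            # NOTE(review): min(output) returns the smallest key of the dict (always 0),
            # not the key of the smallest value, so the method name printed below is
            # always recognised_methods[0]; min(output, key=output.get) looks like
            # what is intended - confirm and fix.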
            min_index = min(output)
            min_value = min(output.values())
            print('The best method is %s, with %s difference %s' % (recognised_methods[min_index], metric, min_value))