Source code for calibration.sub.compute_income

# -*- coding: utf-8 -*-

import numpy as np
import scipy.io
import copy
import math


[docs]def import_transport_costs(grid, param, yearTraffic,
                           households_per_income_class,
                           spline_inflation, spline_fuel,
                           spline_population_income_distribution,
                           spline_income_distribution, path_precalc_inp,
                           path_precalc_transp, dim, options):
    """
    Compute monetary and time costs from commuting for some given year.

    This function leverages the CoCT's EMME/2 transport model for inputs.

    Parameters
    ----------
    grid : DataFrame
        Table yielding, for each grid cell (24,014), its x and y
        (centroid) coordinates, and its distance (in km) to the city
    param : dict
        Dictionary of default parameters
    yearTraffic : int
        Year for which we want to run the function
    households_per_income_class : ndarray(float64)
        Exogenous total number of households per income group (excluding people
        out of employment, for 4 groups)
    spline_inflation : interp1d
        Linear interpolation for inflation rate (in base 100 relative to
        baseline year) over the years (baseline year set at 0)
    spline_fuel : interp1d
        Linear interpolation for fuel price (in rands per km)
        over the years (baseline year set at 0)
    spline_population_income_distribution : interp1d
        Linear interpolation for total population per income group in the data
        (12) over the years (baseline year set at 0)
    spline_income_distribution : interp1d
        Linear interpolation for median annual income (in rands) per income
        group in the data (12) over the years (baseline year set at 0)
    path_precalc_inp : str
        Path for precalcuted input data (calibrated parameters)
    path_precalc_transp : str
        Path for precalcuted transport inputs (intermediate outputs from
        commuting choice model)
    dim : str
        Geographic level of analysis at which we want to run the commuting
        choice model: should be set to "GRID" or "SP"
    options : dict
        Dictionary of default options

    Returns
    -------
    timeOutput : ndarray(float64, ndim=3)
        Duration (in min) of a round trip for each transport mode (5) between
        each selected geographic unit and each selected job center (185)
        centroids
    distanceOutput : ndarray(float64, ndim=3)
        Distance (in km) of a one-way trip for each transport mode (5) between
        each selected geographic unit and each selected job center (185)
        centroids. Note that distances actually do not change across transport
        modes.
    monetaryCost : ndarray(float64, ndim=3)
        Annual monetary cost (in rands) of a round trip for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids
    costTime : ndarray(float64, ndim=3)
        Daily share of working time spent commuting for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids. This will be multiplied by expected income to get the
        opportunity cost of time.

    """
    # STEP 1: IMPORT TRAVEL TIMES AND COSTS

    # Import travel times and distances
    transport_times = scipy.io.loadmat(path_precalc_inp
                                       + 'Transport_times_' + dim)

    # Public fares per km: see Pfeiffer et al. (appendix B2)
    # This is based on Roux (2013), table 4.15 : fares are regressed on
    # distance thresholds. The intercept defines the fixed component and the
    # slope defines the variable component (see Transport-costs.xlsx in Aux
    # data)

    # Note that we take into account the difference in inflation between the
    # year the data was collected and the year of our analysis (can be set to
    # other value than baseline)
    # NB: 40 in variable cost correspond to 20 working days per month, times
    # 2 for the round trip
    priceTrainPerKM = (
        0.164 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
                            )
    priceTrainFixedMonth = (
        4.48 * 40 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
        )
    # Note that bus and taxi had been interverted in Basile's code
    priceBusPerKM = (
        0.785 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
        )
    priceBusFixedMonth = (
        4.32 * 40 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
        )
    # priceTaxiPerKM = (
    #     0.785 * spline_inflation(2011 - param["baseline_year"])
    #     / spline_inflation(2013 - param["baseline_year"])
    #     )
    # priceTaxiFixedMonth = (
    #     4.32 * 40 * spline_inflation(2011 - param["baseline_year"])
    #     / spline_inflation(2013 - param["baseline_year"])
    #     )
    priceTaxiPerKM = (
        0.522 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
        )
    priceTaxiFixedMonth = (
        6.24 * 40 * spline_inflation(2011 - param["baseline_year"])
        / spline_inflation(2013 - param["baseline_year"])
        )
    # priceBusPerKM = (
    #     0.522 * spline_inflation(2011 - param["baseline_year"])
    #     / spline_inflation(2013 - param["baseline_year"])
    #     )
    # priceBusFixedMonth = (
    #     6.24 * 40 * spline_inflation(2011 - param["baseline_year"])
    #     / spline_inflation(2013 - param["baseline_year"])
    #     )

    # Again, we correct for inflation with respect to year the function is run
    inflation = spline_inflation(yearTraffic)
    if options["correct_infla_base"] == 0:
        infla_base = spline_inflation(2012 - param["baseline_year"])
    elif options["correct_infla_base"] == 1:
        infla_base = spline_inflation(2011 - param["baseline_year"])
    priceTrainPerKM = priceTrainPerKM * inflation / infla_base
    priceTrainFixedMonth = priceTrainFixedMonth * inflation / infla_base
    priceTaxiPerKM = priceTaxiPerKM * inflation / infla_base
    priceTaxiFixedMonth = priceTaxiFixedMonth * inflation / infla_base
    priceBusPerKM = priceBusPerKM * inflation / infla_base
    priceBusFixedMonth = priceBusFixedMonth * inflation / infla_base
    priceFuelPerKM = spline_fuel(yearTraffic)

    # Private fixed costs, in rands (variable is already defined by
    # priceFuelPerKM)
    # See Pfeiffer et al. (appendix B2) : corresponds to monthly vehicle
    # depreciation in 2011
    priceFixedVehiculeMonth = 400
    priceFixedVehiculeMonth = priceFixedVehiculeMonth * inflation / infla_base

    # STEP 2: TRAVEL TIMES AND COSTS AS MATRIX

    # Parameters: see Pfeiffer et al. (appendix B2)
    # We assume 8 working hours per day and 20 working days per month
    # numberDaysPerYear = 235
    numberDaysPerYear = 240
    numberHourWorkedPerDay = 8

    # Time taken by each mode for a round trip (in sec)
    # Includes walking time to station and other features from EMME/2 model
    timeOutput = np.empty(
        (transport_times["durationTrain"].shape[0],
         transport_times["durationTrain"].shape[1], 5)
        )
    timeOutput[:] = np.nan

    # To get walking times, we take 2 times the distances by car (to get trips
    # in both directions) multiplied by 1.2 (sinusoity coefficient), divided
    # by the walking speed (in km/h), which we multiply by 60 to get minutes
    # NB: see Viguié et al. (2014), table B.1, for sinusoity estimate

    # if options["correct_round_trip"] == 1:
    #     # Note that we do have some negative durations: their number is small
    #     # so we just convert them to zero
    #     timeOutput[:, :, 0] = (transport_times["distanceCar"]
    #                            / param["walking_speed"] * 60 * 60 * 1.2 * 2)
    #     timeOutput[:, :, 0][np.isnan(transport_times["durationCar"])
    #                         ] = np.nan
    #     timeOutput[:, :, 1] = (
    #          copy.deepcopy(transport_times["durationTrain"])*2)
    #     timeOutput[:, :, 1][transport_times["durationTrain"] < 0] = 0
    #     timeOutput[:, :, 2] = copy.deepcopy(transport_times["durationCar"])*2
    #     timeOutput[:, :, 2][transport_times["durationCar"] < 0] = 0
    #     timeOutput[:, :, 3] = (
    #         copy.deepcopy(transport_times["durationMinibus"])*2)
    #     timeOutput[:, :, 3][transport_times["durationMinibus"] < 0] = 0
    #     timeOutput[:, :, 4] = copy.deepcopy(transport_times["durationBus"])*2
    #     timeOutput[:, :, 4][transport_times["durationBus"] < 0] = 0

    # Note that we do have some negative durations: their number is small,
    # so we just convert them to zero
    timeOutput[:, :, 0] = (transport_times["distanceCar"]
                           / param["walking_speed"] * 60 * 60 * 1.2 * 2)
    timeOutput[:, :, 0][np.isnan(transport_times["durationCar"])] = 0
    timeOutput[:, :, 1] = copy.deepcopy(transport_times["durationTrain"])
    timeOutput[:, :, 1][transport_times["durationTrain"] < 0] = 0
    timeOutput[:, :, 2] = copy.deepcopy(transport_times["durationCar"])
    timeOutput[:, :, 2][transport_times["durationCar"] < 0] = 0
    timeOutput[:, :, 3] = copy.deepcopy(transport_times["durationMinibus"])
    timeOutput[:, :, 3][transport_times["durationMinibus"] < 0] = 0
    timeOutput[:, :, 4] = copy.deepcopy(transport_times["durationBus"])
    timeOutput[:, :, 4][transport_times["durationBus"] < 0] = 0

    # Length (in km) using each mode
    if options["correct_round_trip"] == 1:
        multiplierPrice = np.empty((timeOutput.shape))
        multiplierPrice[:] = np.nan
        multiplierPrice[:, :, 0] = np.zeros((timeOutput[:, :, 0].shape))
        multiplierPrice[:, :, 1] = transport_times["distanceCar"] * 2
        multiplierPrice[:, :, 2] = transport_times["distanceCar"] * 2
        multiplierPrice[:, :, 3] = transport_times["distanceCar"] * 2
        multiplierPrice[:, :, 4] = transport_times["distanceCar"] * 2
    elif options["correct_round_trip"] == 0:
        multiplierPrice = np.empty((timeOutput.shape))
        multiplierPrice[:] = np.nan
        multiplierPrice[:, :, 0] = np.zeros((timeOutput[:, :, 0].shape))
        multiplierPrice[:, :, 1] = transport_times["distanceCar"]
        multiplierPrice[:, :, 2] = transport_times["distanceCar"]
        multiplierPrice[:, :, 3] = transport_times["distanceCar"]
        multiplierPrice[:, :, 4] = transport_times["distanceCar"]

    # Convert the results to annual cost
    pricePerKM = np.empty(5)
    pricePerKM[:] = np.nan
    pricePerKM[0] = np.zeros(1)
    pricePerKM[1] = priceTrainPerKM*numberDaysPerYear
    pricePerKM[2] = priceFuelPerKM*numberDaysPerYear
    pricePerKM[3] = priceTaxiPerKM*numberDaysPerYear
    pricePerKM[4] = priceBusPerKM*numberDaysPerYear

    # Simple distances (useful for residence-workplace distance distribution)
    distanceOutput = np.empty((timeOutput.shape))
    distanceOutput[:] = np.nan
    distanceOutput[:, :, 0] = transport_times["distanceCar"]
    distanceOutput[:, :, 1] = transport_times["distanceCar"]
    distanceOutput[:, :, 2] = transport_times["distanceCar"]
    distanceOutput[:, :, 3] = transport_times["distanceCar"]
    distanceOutput[:, :, 4] = transport_times["distanceCar"]

    # STEP 3 : MONETARY AND TIME COSTS

    # Monetary price per year (for each employment center)
    # NB: 12 is for number of months in a year
    monetaryCost = np.zeros((185, timeOutput.shape[1], 5))
    for index2 in range(0, 5):
        monetaryCost[:, :, index2] = (pricePerKM[index2]
                                      * multiplierPrice[:, :, index2])
    #  Train
    monetaryCost[:, :, 1] = monetaryCost[:, :, 1] + priceTrainFixedMonth * 12
    #  Private car
    monetaryCost[:, :, 2] = (monetaryCost[:, :, 2] + priceFixedVehiculeMonth
                             * 12)
    #  Minibus/taxi
    monetaryCost[:, :, 3] = monetaryCost[:, :, 3] + priceTaxiFixedMonth * 12
    #  Bus
    monetaryCost[:, :, 4] = monetaryCost[:, :, 4] + priceBusFixedMonth * 12

    # We assume that people not travelling a certain distance have an extra
    # high cost of doing so
    monetaryCost[np.isnan(monetaryCost)] = 10 ** 5

    # Daily share of working time spent commuting (all values in sec)
    # NB: The time cost parameter is a proportionality factor between time
    # and monetary values. By default, it is set as 1: the welfare loss from
    # time spent commuting directly translates into foregone wages.
    costTime = (timeOutput * param["time_cost"]
                / (60 * 60 * numberHourWorkedPerDay))

    # We assume that people not travelling a certain distance have an extra
    # high cost of doing so
    costTime[np.isnan(costTime)] = 1

    return timeOutput, distanceOutput, monetaryCost, costTime


[docs]def EstimateIncome(param, timeOutput, distanceOutput, monetaryCost, costTime,
                   job_centers, average_income, income_distribution,
                   list_lambda, options):
    """
    Estimate incomes per job center and number of commuters per distance bin.

    This function leverages the commutingSolve() function to iterate over
    income values until the target number of commuters for each job center is
    reached. This allows to compute incomes per job center (and income group)
    and to find the split of commuters across distance-from-CBD brackets, for
    several values of the gravity parameter (from commuting choice model).

    Parameters
    ----------
    param : dict
        Dictionary of default options
    timeOutput : ndarray(float64, ndim=3)
        Duration (in min) of a round trip for each transport mode (5) between
        each selected geographic unit and each selected job center (185)
        centroids
    distanceOutput : ndarray(float64, ndim=3)
        Distance (in km) of a one-way trip for each transport mode (5) between
        each selected geographic unit and each selected job center (185)
        centroids. Note that distances actually do not change across transport
        modes.
    monetaryCost : ndarray(float64, ndim=3)
        Annual monetary cost (in rands) of a round trip for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids
    costTime : ndarray(float64, ndim=3)
        Daily share of working time spent commuting for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids. This will be multiplied by expected income to get the
        opportunity cost of time.
    job_centers : ndarray(float64, ndim=2)
        Number of jobs in each selected job center (185) per income group (4).
        Remember that we rescale the number of individual jobs to reflect total
        household employment, as our income and population data are for
        households only: one job basically provides employment for two people.
        This simplification allows to model households as a single
        representative agent and to abstract from a two-body problem.
        Empirically, this holds on aggregate as households' position on the
        labor market is often determined by one household head.
    average_income : ndarray(float64)
        Average median income for each income group in the model (4)
    income_distribution : ndarray(uint16, ndim=2)
        Exogenous number of households in each Small Place (1,046) for each
        income group in the model (4)
    list_lambda : ndarray(float64)
        List of values over which to scan for the gravity parameter used in
        the commuting choice model
    options : dict
        Dictionary of default options

    Returns
    -------
    incomeCentersSave : ndarary(float64, ndim=3)
        Calibrated annual household income (in rands) for each income group (4)
        and each selected job center (185), for each scanned value of the
        gravity parameter
    distanceDistribution : ndarray(float64, ndim=2)
        Share of residence-workplace distances in each 5-km from CBD bracket
        for each scanned value of the gravity parameter
    scoreMatrix : ndarray(float64, ndim=2)
        Ratio of simulated over observed (rescaled) number of jobs for each
        income group (4) and each scanned value of the gravity parameter:
        defines an error metric for the quality of our calibration

    """
    # Setting time and space
    annualToHourly = 1 / (8*20*12)
    monetary_cost = monetaryCost * annualToHourly
    #  Corresponds to the brackets for which we have aggregate statistics on
    #  the number of commuters to fit our calibration
    bracketsDistance = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 200])

    #  We initialize output matrices
    incomeCentersSave = np.zeros((len(job_centers[:, 0]), 4, len(list_lambda)))
    distanceDistribution = np.zeros(
        (len(bracketsDistance) - 1, len(list_lambda)))
    scoreMatrix = np.zeros((len(list_lambda), param["nb_of_income_classes"]))

    # We begin simulations for different values of lambda

    for i in range(0, len(list_lambda)):

        param_lambda = list_lambda[i]

        print('Estimating for lambda = ', param_lambda)

        # We initialize output vectors for each lambda
        incomeCentersAll = -math.inf * np.ones((len(job_centers[:, 0]), 4))
        distanceDistributionGroup = np.zeros((len(bracketsDistance) - 1, 4))

        # We run separate simulations for each income group

        for j in range(0, param["nb_of_income_classes"]):

            # (Employed) household size varies with income group / transport
            # costs: we select the income group of interest
            householdSize = param["household_size"][j]
            # Same for "average" income (which needs to be adapted to hourly
            # from income data)
            averageIncomeGroup = average_income[j] * annualToHourly

            print('incomes for group ', j)

            # We consider job centers where selected income group represents
            # more than 1/4 of the number-of-jobs threshold (used to select
            # job centers among transport zones): it allows to avoid marginal
            # crossings between income classes and job centers, hence reduces
            # the number of equations to solve and makes optimization faster
            # (+ no corner cases)

            # NB: numeric solver (gradient descent) does not work when
            # function is not always differentiable (which is the case here as,
            # above/below some utility threshold, we have tipping effects),
            # hence we code our own solver to remain in the interior

            whichCenters = job_centers[:, j] > 600
            # whichCenters = (
            #     job_centers[:, j] > param["job_center_threshold"] / 4)
            popCenters = job_centers[whichCenters, j]

            # We reweight population in each income group per SP to make it
            # comparable with population in SELECTED job centers
            # Note that unemployed population is not included! This is
            # important with what follows.
            # Also note that SP data includes more areas than included in grid
            popResidence = (
                income_distribution[:, j]
                * sum(job_centers[whichCenters, j])
                / sum(income_distribution[:, j])
            )

            # Numeric parameters come from trial and error and do not change
            # results a priori: they just help convergence
            maxIter = 1000
            tolerance = 0.01
            if j == 0:
                factorConvergenge = (
                    0.01 / 100
                    * (np.log(param_lambda**10 / 10**10) + np.log(10**10))
                    )
            elif j == 1:
                factorConvergenge = (
                    0.002 / 100
                    * (np.log(param_lambda**10 / 10**10) + np.log(10**10))
                    )
            elif j == 2:
                factorConvergenge = (
                    0.001 / 100
                    * (np.log(param_lambda**10 / 10**10) + np.log(10**10))
                    )
            elif j == 3:
                factorConvergenge = (
                    0.0008 / 100
                    * (np.log(param_lambda**10 / 10**10) + np.log(10**10))
                    )

            iter = 0
            error = np.zeros((len(popCenters), maxIter))
            scoreIter = np.zeros(maxIter)
            errorMax = 1

            # Initializing the solver
            incomeCenters = np.zeros((sum(whichCenters), maxIter))
            incomeCenters[:, 0] = (
                averageIncomeGroup
                * (popCenters / np.nanmean(popCenters))
                ** (0.1)
            )
            # Initial error corresponds to the difference between observed
            # and simulated population working in each job center
            error[:, 0], _ = commutingSolve(
                incomeCenters[:, 0], averageIncomeGroup, popCenters,
                popResidence, monetary_cost, costTime, param_lambda,
                householdSize, whichCenters, bracketsDistance, distanceOutput,
                options)

            # Then we iterate by adding to each job center income the average
            # value of income per income group, weighted by the importance of
            # the error relative to the observed job center population: if we
            # underestimate the population, we increase the income
            while ((iter <= maxIter - 1) & (errorMax > tolerance)):

                incomeCenters[:, iter] = (
                    incomeCenters[:, max(iter-1, 0)]
                    + factorConvergenge
                    * averageIncomeGroup
                    * error[:, max(iter - 1, 0)]
                    / popCenters
                )

                # We also update the error term and store some values
                # before iterating over
                error[:, iter], _ = commutingSolve(
                    incomeCenters[:, iter], averageIncomeGroup, popCenters,
                    popResidence, monetary_cost, costTime, param_lambda,
                    householdSize, whichCenters, bracketsDistance,
                    distanceOutput, options)

                errorMax = np.nanmax(
                    np.abs(error[:, iter] / popCenters))
                scoreIter[iter] = np.nanmean(
                    np.abs(error[:, iter] / popCenters))

                iter = iter + 1

                # print(iter)
                # print("errorMax = " + str(errorMax))
                # print("errorMean = "
                #       + str(np.nanmean(np.abs(error[:, iter] / popCenters))))

            # At the end of the process, we keep the minimum score, and define
            # the corresponding best solution for some lambda and income group
            if (iter > maxIter - 1):
                scoreBest = np.amin(scoreIter)
                scoreMatrix[i, j] = scoreBest
                bestSolution = np.argmin(scoreIter)
                incomeCenters[:, iter-1] = incomeCenters[:, bestSolution]
                print(' - max iteration reached - mean error', scoreBest)

            # If we manage to have a maximum error that falls under the
            # tolerance threshold, we leave the loop and consider the solution
            # corresponding to the latest iteration
            else:
                scoreBest = scoreIter[iter-1]
                scoreMatrix[i, j] = scoreBest
                print(' - computed - max error', errorMax)
                # print(str(iter-1))

            # We also get (for the given income group) the number of commuters
            # for all job centers in given distance brackets
            _, distanceDistributionGroup[:, j] = commutingSolve(
                incomeCenters[:, iter-1], averageIncomeGroup, popCenters,
                popResidence, monetary_cost, costTime, param_lambda,
                householdSize, whichCenters, bracketsDistance, distanceOutput,
                options)

            # We rescale calibrated incomes to stick to overall income data
            # scale: remember that we only computed them for a subset of the
            # population. This allows income levels to be more representative
            # (although this does not change anything in relative terms)
            # NB: note that here, the way we define "average" income from the
            # data actually impact the rescaling
            incomeCentersRescaled = (
                incomeCenters[:, iter-1]
                * averageIncomeGroup
                / (np.nansum(incomeCenters[:, iter-1] * popCenters)
                    / np.nansum(popCenters))
            )

            # Then we update the output matrix for the given income group
            # (with lambda still fixed)
            incomeCentersAll[whichCenters, j] = incomeCentersRescaled

        # We can now loop over different values of lambda and store values back
        # in yearly format for incomes
        incomeCentersSave[:, :, i] = incomeCentersAll / annualToHourly

        # Likewise, for each value of lambda, we store the % of total commuters
        # for each distance bracket
        distanceDistribution[:, i] = (
            np.nansum(distanceDistributionGroup, 1)
            / np.nansum(distanceDistributionGroup)
        )

    return incomeCentersSave, distanceDistribution, scoreMatrix


[docs]def compute_ODflows(householdSize, monetaryCost, costTime, incomeCentersFull,
                    whichCenters, param_lambda, options):
    """
    Compute probability distribution and transport costs of commuting pairs.

    This function applies theoretical formulas from the commuting choice model,
    for a given income group and a given gravity parameter (see math appendix
    for more details).

    Parameters
    ----------
    householdSize : float
        Average number of employed workers per household for one income class
        (corresponds to 2 times the employment rate)
    monetaryCost : ndarray(float64, ndim=3)
        Hourly monetary cost (in rands) of a round trip for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids
    costTime : ndarray(float64, ndim=3)
        Daily share of working time spent commuting for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids. This will be multiplied by expected income to get the
        opportunity cost of time.
    incomeCentersFull : ndarray(float64)
        Hourly household income (in rands) for some income group and some
        iteration, per selected job center, rescaled to match income
        distribution across the overall population
    whichCenters : ndarray(bool)
        Dummy variable allowing the narrow selection of job centers used for
        income calibration for a given income group, among pre-selected job
        centers (185)
    param_lambda : float64
        Tested value for the gravity parameter used in the commuting choice
        model

    Returns
    -------
    transportCostModes : ndarray(float64, ndim=3)
        Expected (hourly) total commuting cost (in rands) for one agent,
        for all (narrowly) selected job centers, all SPs of residence (1046),
        and all transport modes (5)
    transportCost : ndarray(float64, ndim=2)
        Expected (hourly) total commuting cost (in rands) for one agent,
        for all (narrowly) selected job centers and all SPs of residence
        (1046): corresponds to min(transportCostModes) across transport modes.
    ODflows : ndarray(float64, ndim=2)
        Probability, for a given income group, to work in each (narrowly)
        selected job center for each potential SP of residence (1046)
    valueMax : ndarray(float64, ndim=2)
        Numerical parameter for each commuting pair that prevents logarithms
        and exponentials used in calculations from diverging towards infinity:
        it corresponds to the maximum argument for exponentials that appear in
        the formula for transportCost: this is not used as an input in other
        functions and is just included here for reference
    minIncome : ndarray(float64)
        Numerical parameter for each SP of residence that prevents logarithms
        and exponentials used in calculations from diverging towards infinity:
        it corresponds to minus the minimum argument for exponentials that
        appear in the formula for ODflows: this is not used as an input in
        other functions and is just included here for reference

    """
    # We first compute expected (hourly) total commuting cost per mode
    # Note that incomeCentersFull is already defined at the household level,
    # whereas monetaryCost is defined at the individual level: hence, we need
    # to multiply monetaryCost by householdsSize to get the value of the
    # monetary costs for all members of the households
    transportCostModes = (
        householdSize * monetaryCost[whichCenters, :, :]
        + (costTime[whichCenters, :, :] * incomeCentersFull[:, None, None])
    )

    # This prevents exponentials/logarithms from diverging towards infinity
    # This is neutral on the result and is set by trial and error
    valueMax = np.nanmin(param_lambda * transportCostModes, 2) - 500

    # We get expected commuting cost by minimizing transportCostModes across
    # transport modes. This yields the formula below (see log-sum
    # specification in math appendix)

    # Note that the valueMax term ensures some minimal value for the
    # exponential term, hence a value sufficiently bigger than zero in the
    # logarithm term. Then, its impact is cancelled out by substracting it
    # aside from the log-term.

    # Actually, this is not needed in this version of the model
    # if options["prevent_exp_overflow"] == 1:
    #     transportCost = (
    #         - 1 / param_lambda
    #         * np.log(
    #             np.nansum(np.exp(- param_lambda * transportCostModes
    #                               + valueMax[:, :, None]), 2))
    #         - valueMax
    #     )
    transportCost = (
        - 1 / param_lambda
        * np.log(
            np.nansum(np.exp(- param_lambda * transportCostModes), 2))
    )

    # Also for exponential/logarithm convergence
    minIncome = (np.nanmax(
        param_lambda * (incomeCentersFull[:, None] - transportCost), 0) - 700
        )

    # Then, we compute probability distribution of each commuting pair, from
    # discrete choice modelling.
    # Note that the minIncome term ensures that the values in the exponential
    # terms do not grow too big. Then, its impact is cancelled out as it
    # appears both in the numerator and the denominator of the fraction.
    # NB: here, we need to divide by the scale factor (typo in Pfeiffer et al.)

    if options["prevent_exp_overflow"] == 1:
        ODflows = (
            np.exp(param_lambda * (incomeCentersFull[:, None] - transportCost)
                   - minIncome)
            / np.nansum(
                np.exp(param_lambda
                       * (incomeCentersFull[:, None] - transportCost)
                       - minIncome),
                0)[None, :]
        )
    elif options["prevent_exp_overflow"] == 0:
        ODflows = (
            np.exp(param_lambda * (incomeCentersFull[:, None] - transportCost))
            / np.nansum(
                np.exp(param_lambda
                       * (incomeCentersFull[:, None] - transportCost)),
                0)[None, :]
        )

    return transportCostModes, transportCost, ODflows, valueMax, minIncome


[docs]def commutingSolve(incomeCentersTemp, averageIncomeGroup, popCenters,
                   popResidence, monetaryCost, costTime, param_lambda,
                   householdSize, whichCenters, bracketsDistance,
                   distanceOutput, options):
    """
    Compute error and distribution for job allocation simulated from incomes.

    This function leverages the compute_ODflows() function to compute a
    theoretical equivalent to number of jobs in each (narrowly) selected job
    center, and the distribution of those jobs across pre-defined
    distance-from-CBD brackets.

    Parameters
    ----------
    incomeCentersTemp : ndarray(float64)
        Hourly household income (in rands) for some income group and some
        iteration, per selected job center
    averageIncomeGroup : float64
        Average hourly median income (from data) for some income group
    popCenters : ndarray(float64)
        Number of jobs in each (narrowly) selected job center for some income
        group. Remember that we further restrict the set of selected job
        centers when calibrating incomes per income group for the sake of
        numerical simplicity. Also remember that we rescale the number of
        individual jobs to reflect total household employment, as our income
        and population data are for households only: one job basically provides
        employment for two people.
        This simplification allows to model households as a single
        representative agent and to abstract from a two-body problem.
        Empirically, this holds on aggregate as households' position on the
        labor market is often determined by one household head.
    popResidence : ndarray(float64)
        Number of households living in each SP (1046), per income group (4):
        comes from census data and does not include people not working
    monetaryCost : ndarray(float64, ndim=3)
        Hourly monetary cost (in rands) of a round trip for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids
    costTime : ndarray(float64, ndim=3)
        Daily share of working time spent commuting for each transport mode
        (5) between each selected geographic unit and each selected job center
        (185) centroids. This will be multiplied by expected income to get the
        opportunity cost of time.
    param_lambda : float64
        Tested value for the gravity parameter used in the commuting choice
        model
    householdSize : float
        Average number of employed workers per household for one income class
        (corresponds to 2 times the employment rate)
    whichCenters : ndarray(bool)
        Dummy variable allowing the narrow selection of job centers used for
        income calibration for a given income group, among pre-selected job
        centers (185)
    bracketsDistance : ndarray(int32)
        Array of floor values for distance brackets used to select the set of
        incomes + gravity parameter that best fit the distribution of
        residence-workplace distances according to those brackets
    distanceOutput : ndarray(float64, ndim=3)
        Distance (in km) of a one-way trip for each transport mode (5) between
        each selected geographic unit and each selected job center (185)
        centroids. Note that distances actually do not change across transport
        modes.
    options : dict
        Dictionary of default options

    Returns
    -------
    score : ndarray(float64)
        Difference between observed and simulated population working in each
        (narrowly) selected job center (for a given income group)
    nbCommuters : ndarray(float64)
        Number of commuters in each pre-defined distance bracket, for a given
        income group

    """
    # We redress the average income in each group per job center to match
    # income data, as resulting ODflows (see below) must be matched with
    # popResidence (defined for the overall population, and not a subset of
    # narrowly selected job centers)
    incomeCentersFull = (
        incomeCentersTemp
        * averageIncomeGroup
        / ((np.nansum(incomeCentersTemp * popCenters)
            / np.nansum(popCenters)))
    )

    # We are essentially interested in the ODflows matrix, that is the
    # probability, for a given income group, to work in each (narrowly)
    # selected job center for each potential SP of residence
    transportCostModes, transportCost, ODflows, *_ = compute_ODflows(
        householdSize, monetaryCost, costTime, incomeCentersFull,
        whichCenters, param_lambda, options)

    # We compare the true population distribution with its theoretical
    # equivalent computed from simulated income distribution: the closer
    # the score is to zero, the better (see math appendix)

    # NB: correction is not needed a priori, as income distribution data from
    # SP does not include people out of employment
    if options["correct_eq3"] == 1:
        score = (popCenters
                 - (householdSize / 2
                    * np.nansum(popResidence[None, :] * ODflows, 1))
                 )
    elif options["correct_eq3"] == 0:
        score = (popCenters
                 - np.nansum(popResidence[None, :] * ODflows, 1))

    # We also return the total number of commuters (in a given income group)
    # for (narrowly) selected job centers in predefined distance brackets
    nbCommuters = np.zeros(len(bracketsDistance) - 1)
    for k in range(0, len(bracketsDistance)-1):
        which = ((distanceOutput[whichCenters, :] > bracketsDistance[k])
                 & (distanceOutput[whichCenters, :] <= bracketsDistance[k + 1])
                 & (~np.isnan(distanceOutput[whichCenters, :])))
        if options["correct_eq3"] == 1:
            nbCommuters[k] = (
                np.nansum(which * ODflows * popResidence[None, :])
                * (householdSize / 2)
                )
        elif options["correct_eq3"] == 0:
            nbCommuters[k] = (
                np.nansum(which * ODflows * popResidence[None, :]))

    return score, nbCommuters