Source code for calibration.calib_main_func

# -*- coding: utf-8 -*-

import numpy as np
import numpy.matlib
import statsmodels.api as sm
import os
import math
import scipy.io

import calibration.sub.compute_income as calcmp
import calibration.sub.import_employment_data as calemp
import calibration.sub.estimate_parameters_by_scanning as calscan
import calibration.sub.estimate_parameters_by_optimization as calopt
import calibration.sub.import_amenities as calam


def estim_construct_func_param(options, param, data_sp,
                               threshold_income_distribution,
                               income_distribution, data_rdp,
                               housing_types_sp, data_number_formal,
                               data_income_group, selected_density,
                               path_data, path_precalc_inp, path_folder):
    """
    Estimate coefficients of housing production function (Cobb-Douglas).

    This function leverages a partial relation from our general equilibrium
    model, which is estimated on SP data that does not enter the simulations
    as an input. More precisely, it combines the expression of the optimal
    housing supply in the formal private sector with the highest bid
    condition (see technical documentation for math formulas).

    Parameters
    ----------
    options : dict
        Dictionary of default options
    param : dict
        Dictionary of default parameters
    data_sp : DataFrame
        Table yielding, for each Small Place (1,046), the average dwelling
        size (in m²), the average land price and annual income level (in
        rands), the size of unconstrained area for construction (in m²),
        the total area (in km²), the distance to the city centre (in km),
        whether or not the location belongs to Mitchells Plain, and the SP
        code
    threshold_income_distribution : ndarray(int32)
        Annual income level (in rands) above which a household is taken as
        being part of one of the 4 income groups in the model
    income_distribution : ndarray(uint16, ndim=2)
        Exogenous number of households in each Small Place (1,046) for each
        income group in the model (4)
    data_rdp : DataFrame
        Table yielding, for each grid cell (24,014), the associated
        cumulative count of cells with some formal subsidized housing, and
        the associated area (in m²) dedicated to such housing
    housing_types_sp : DataFrame
        Table yielding, for each Small Place (1,046), the number of informal
        backyards, of informal settlements, and total dwelling units, as
        well as their (centroid) x and y coordinates
    data_number_formal : Series
        Number of formal private housing units considered for each Small
        Place (1,046)
    data_income_group : ndarray(float64)
        Categorical variable indicating, for each Small Place (1,046), the
        dominant income group (from 0 to 3)
    selected_density : Series
        Dummy variable allowing for sample selection across Small Places
        (1,046) for regressions that are only valid in the formal private
        housing sector
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    path_folder : str
        Path towards the root data folder

    Returns
    -------
    coeff_b : float64
        Calibrated capital elasticity in housing production function
    coeff_a : float64
        Calibrated land elasticity in housing production function
    coeffKappa : float64
        Calibrated scale factor in housing production function

    """
    # We define our outcome variable
    y = np.log(data_number_formal[selected_density])

    # We define our independent variables
    # Note that we use data_sp["unconstrained_area"] (which is accurate data
    # at SP level) rather than coeff_land (which is an estimate at grid
    # level)
    X = np.transpose(
        np.array([np.ones(len(data_sp["price"][selected_density])),
                  np.log(data_sp["price"][selected_density]),
                  np.log(data_sp["dwelling_size"][selected_density]),
                  np.log(param["max_land_use"]
                         * data_sp["unconstrained_area"][selected_density])])
        )
    # NB: Our data set for dwelling sizes only provides the average dwelling
    # size at the Sub-Place level, aggregating formal and informal housing

    # We run the linear regression
    modelSpecification = sm.OLS(y, X, missing='drop')
    model_construction = modelSpecification.fit()
    print(model_construction.summary())
    parametersConstruction = model_construction.params

    # We export outputs of the model
    coeff_b = parametersConstruction["x1"]
    coeff_a = 1 - coeff_b
    # Scale factor formula comes from the zero profit condition combined
    # with footnote 16 from Pfeiffer et al. (typo in original paper)
    if options["correct_kappa"] == 1:
        coeffKappa = ((1 / (coeff_b / coeff_a) ** coeff_b)
                      * np.exp(parametersConstruction["const"]))
    elif options["correct_kappa"] == 0:
        coeffKappa = ((1 / (coeff_b) ** coeff_b)
                      * np.exp(parametersConstruction["const"]))

    try:
        os.mkdir(path_precalc_inp)
    except OSError as error:
        print(error)

    np.save(path_precalc_inp + 'calibratedHousing_b.npy', coeff_b)
    np.save(path_precalc_inp + 'calibratedHousing_kappa.npy', coeffKappa)

    return coeff_b, coeff_a, coeffKappa
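
# Illustrative sketch (not part of the original pipeline): the estimation
# above regresses log formal housing units on a constant, log land price,
# log dwelling size and log available land, and reads the capital elasticity
# coeff_b off the log-price coefficient. The helper below reproduces that
# regression layout on synthetic data; the data-generating process, the
# chosen coefficient values and the function name are hypothetical, only the
# layout mirrors the code above.
def _example_construct_func_regression(seed=0):
    rng = np.random.default_rng(seed)
    n = 500
    # Hypothetical log-normal covariates standing in for SP-level data
    price = np.exp(rng.normal(8.0, 0.5, n))
    dwelling_size = np.exp(rng.normal(4.0, 0.3, n))
    land = np.exp(rng.normal(12.0, 0.4, n))
    b_true = 0.6
    # Assumed log-linear outcome in which the log-price coefficient is the
    # capital elasticity, as in the estimation above
    y = (0.5 + b_true * np.log(price) - np.log(dwelling_size)
         + np.log(land) + rng.normal(0, 0.1, n))
    X = np.column_stack([np.ones(n), np.log(price),
                         np.log(dwelling_size), np.log(land)])
    # Returns an estimate close to b_true (here, 0.6)
    return sm.OLS(y, X).fit().params[1]
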

def estim_incomes_and_gravity(param, grid, list_lambda,
                              households_per_income_class, average_income,
                              income_distribution, spline_inflation,
                              spline_fuel,
                              spline_population_income_distribution,
                              spline_income_distribution,
                              path_data, path_precalc_inp,
                              path_precalc_transp, options):
    """
    Estimate incomes per job center and income group, with gravity parameter.

    This function leverages theoretical formulas from
    calibration.sub.compute_income and data imported through the
    calibration.sub.import_employment_data module. It first imports
    transport costs and the observed number of commuters per selected job
    centre and income group, then estimates the associated incomes for a
    given gravity parameter by minimizing the error over the simulated
    number of commuters. Then, it selects among a list of scanned values the
    final value of the gravity parameter (and the associated incomes) by
    minimizing the error over the distribution of commuters along their
    residence-workplace distances.

    Parameters
    ----------
    param : dict
        Dictionary of default parameters
    grid : DataFrame
        Table yielding, for each grid cell (24,014), its x and y (centroid)
        coordinates, and its distance (in km) to the city centre
    list_lambda : ndarray(float64)
        List of values over which to scan for the gravity parameter used in
        the commuting choice model
    households_per_income_class : ndarray(float64)
        Exogenous total number of households per income group (excluding
        people out of employment, for 4 groups)
    average_income : ndarray(float64)
        Average median income for each income group in the model (4)
    income_distribution : ndarray(uint16, ndim=2)
        Exogenous number of households in each Small Place (1,046) for each
        income group in the model (4)
    spline_inflation : interp1d
        Linear interpolation for inflation rate (in base 100 relative to
        baseline year) over the years (baseline year set at 0)
    spline_fuel : interp1d
        Linear interpolation for fuel price (in rands per km) over the years
        (baseline year set at 0)
    spline_population_income_distribution : interp1d
        Linear interpolation for total population per income group in the
        data (12) over the years (baseline year set at 0)
    spline_income_distribution : interp1d
        Linear interpolation for median annual income (in rands) per income
        group in the data (12) over the years (baseline year set at 0)
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    path_precalc_transp : str
        Path for precalculated transport inputs (intermediate outputs from
        commuting choice model)
    options : dict
        Dictionary of default options

    Returns
    -------
    incomeCentersKeep : ndarray(float64, ndim=2)
        Calibrated average annual household income (including unemployment)
        for each income group (4), per grid cell (24,014)
    lambdaKeep : float64
        Calibrated gravity parameter from the commuting choice model
    cal_avg_income : ndarray(float64)
        Overall calibrated average income across income groups (4), for
        validation only
    scoreKeep : ndarray(float64)
        Mean ratio of simulated over observed number of commuters per job
        center (185), collapsed at the income-group level (4): captures the
        error over our calibrated parameters
    bhattacharyyaDistances : ndarray(float64)
        Bhattacharyya distances (measuring the similarity of two probability
        distributions) between the calculated distribution of commuting
        distances and aggregates from the Transport Survey, for each scanned
        value of the gravity parameter. This is used as an auxiliary measure
        to pin down a unique gravity parameter (and associated matrix of
        incomes), and is only given as an output of the function for
        reference.

    """
    # We import the number of workers in each selected job center.
    # Note that it is rescaled to match the aggregate income distribution
    # in the census
    job_centers = calemp.import_employment_data(
        households_per_income_class, param, path_data)

    # We import transport cost data.
    # Note that we reason at the SP level here. Also note that we are
    # considering round trips and households made up of two people.
    (timeOutput, distanceOutput, monetaryCost, costTime
     ) = calcmp.import_transport_costs(
        grid, param, 0, households_per_income_class, spline_inflation,
        spline_fuel, spline_population_income_distribution,
        spline_income_distribution, path_precalc_inp, path_precalc_transp,
        'SP', options)

    # Note that this is long to run.
    # Here again, we are considering rescaled income data.
    (incomeCenters, distanceDistribution, scoreMatrix
     ) = calcmp.EstimateIncome(
        param, timeOutput, distanceOutput[:, :, 0], monetaryCost, costTime,
        job_centers, average_income, income_distribution, list_lambda,
        options)

    # Gives aggregate statistics for % of commuters per distance bracket
    # NB: bracketsDistance = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 200])
    # with floor residence-workplace distances in km
    # (see calibration.sub.compute_income)
    # NB: we could check differential mobility patterns as a robustness
    # check by estimating a distinct gravity parameter for each income group
    # if we get access to a breakdown of the residence-workplace distance
    # distribution per income group

    # Residence-workplace distance distribution comes from the CoCT's 2013
    # Transport Survey
    data_distance_distribution = np.array(
        [45.6174222, 18.9010734, 14.9972971, 9.6725616, 5.9425438,
         2.5368754, 0.9267125, 0.3591011, 1.0464129])

    # Compute accessibility index
    # NB1: Bhattacharyya distance measures the similarity of two probability
    # distributions (here, data vs. simulated % of commuters)
    # NB2: Mahalanobis distance is a particular case of the Bhattacharyya
    # distance when the standard deviations of the two classes are the same
    bhattacharyyaDistances = (
        - np.log(np.nansum(np.sqrt(data_distance_distribution[:, None]
                                   / 100 * distanceDistribution), 0)))
    whichLambda = np.argmin(bhattacharyyaDistances)

    # Hence, we keep the lambda that minimizes the distance and the
    # associated income vector
    lambdaKeep = list_lambda[whichLambda]
    incomeCentersKeep = incomeCenters[:, :, whichLambda]

    # We also keep the associated error metric
    # scoreKeep = scoreMatrix[whichLambda, :]
    scoreKeep = scoreMatrix

    # Note that income is set to -inf for job centers and income groups in
    # which it could not be calibrated
    np.save(path_precalc_inp + 'incomeCentersKeep.npy', incomeCentersKeep)
    np.save(path_precalc_inp + 'lambdaKeep.npy', lambdaKeep)

    # Note that it is unclear whether "average" income from the data
    # includes unemployment or not: a priori, it does for short spells (less
    # than one year) and should therefore be slightly bigger than calibrated
    # income (which should reflect all unemployment): this is what we
    # observe in practice
    incomeCentersKeep[incomeCentersKeep < 0] = math.nan
    cal_avg_income = np.nanmean(incomeCentersKeep, 0)

    return (incomeCentersKeep, lambdaKeep, cal_avg_income, scoreKeep,
            bhattacharyyaDistances)
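
# Illustrative sketch (not part of the original pipeline): the gravity
# parameter is selected by minimizing the Bhattacharyya distance between the
# observed and simulated shares of commuters per distance bracket, computed
# above as -log(sum(sqrt(p * q))). The helper below applies the same formula
# to two toy one-dimensional distributions; the function name and inputs are
# hypothetical. Identical distributions give a distance of 0, e.g.
# _example_bhattacharyya_distance([50, 30, 20], [0.5, 0.3, 0.2]) == 0.0.
def _example_bhattacharyya_distance(observed_pct, simulated_share):
    # observed_pct: data distribution in % (as data_distance_distribution)
    # simulated_share: simulated shares summing to 1 (as one column of
    # distanceDistribution)
    observed = np.array(observed_pct, dtype=float) / 100
    simulated = np.array(simulated_share, dtype=float)
    return - np.log(np.nansum(np.sqrt(observed * simulated)))
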

def estim_util_func_param(data_number_formal, data_income_group,
                          housing_types_sp, data_sp, coeff_a, coeff_b,
                          coeffKappa, interest_rate, incomeNetOfCommuting,
                          path_data, path_precalc_inp, options, param):
    """
    Calibrate utility function parameters.

    This function leverages the following modules: import_amenities,
    estimate_parameters_by_scanning, and
    estimate_parameters_by_optimization. As before, we use partial relations
    coming from our general equilibrium structure (see technical
    documentation for math formulas). This time, we look at the utility
    function parameters that maximize a composite likelihood function for
    the fit on observed amenities, dwelling sizes, and population sorting by
    income (see calibration.sub.loglikelihood module). We proceed first by
    scanning over a discrete range of parameter values, then by running a
    smooth solver taking outputs from scanning as initial values.

    Parameters
    ----------
    data_number_formal : Series
        Number of formal private housing units considered for each Small
        Place (1,046)
    data_income_group : ndarray(float64)
        Categorical variable indicating, for each Small Place (1,046), the
        dominant income group (from 0 to 3)
    housing_types_sp : DataFrame
        Table yielding, for each Small Place (1,046), the number of informal
        backyards, of informal settlements, and total dwelling units, as
        well as their (centroid) x and y coordinates
    data_sp : DataFrame
        Table yielding, for each Small Place (1,046), the average dwelling
        size (in m²), the average land price and annual income level (in
        rands), the size of unconstrained area for construction (in m²),
        the total area (in km²), the distance to the city centre (in km),
        whether or not the location belongs to Mitchells Plain, and the SP
        code
    coeff_a : float64
        Calibrated land elasticity in housing production function
    coeff_b : float64
        Calibrated capital elasticity in housing production function
    coeffKappa : float64
        Calibrated scale factor in housing production function
    interest_rate : float64
        Interest rate for the overall economy, corresponding to an average
        over past years
    incomeNetOfCommuting : ndarray(float64, ndim=2)
        Expected annual income net of commuting costs (in rands, for one
        household), for each geographic unit, by income group (4)
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    options : dict
        Dictionary of default options
    param : dict
        Dictionary of default parameters

    Returns
    -------
    calibratedUtility_beta : float64
        Calibrated surplus housing elasticity in households' utility
        function
    calibratedUtility_q0 : float64
        Parametric basic need in housing (in m²). Note that this parameter
        is not an output of the calibration per se, as it is exogenously
        set. It is included here for reference, as it enters the households'
        utility function and enters the optimization programme as an input.
        Note that this could be optimized over (as in Pfeiffer et al.), but
        only within a narrow range of values to preserve feasibility of
        allocations.
    cal_amenities : ndarray(float64)
        Calibrated amenity index for each grid cell (24,014): this is not
        normalized yet.

    """
    # We select in which areas we actually measure the likelihood.
    # We use a less stringent definition than for the fit on building
    # density (estimation of construction function parameters), as we are
    # going to separately add conditions more specific to each regression
    # used in the process.
    # NB: There is a trade-off between empirical validity and statistical
    # power
    selectedSP = (
        (data_number_formal > 0.90 * housing_types_sp.total_dwellings_SP_2011)
        & (data_income_group > 0)
        )

    # We are going to scan over values on which to optimize
    # As explained in the parameters_and_options module, we consider that
    # the benchmark value from Finlay and Williams (2022) is more robust
    # than our own estimation for the elasticity of housing demand. We
    # therefore pin this value down as default. Note that we can define a
    # range of acceptable values for sensitivity checks.
    listBeta = scipy.io.loadmat(
        path_precalc_inp + 'calibratedUtility_beta.mat'
        )["calibratedUtility_beta"].squeeze()
    # listBeta = 0.25
    # listBeta = np.arange(0.1, 0.41, 0.02)

    # Again, we pin the value of the basic need in housing to the one
    # estimated in Pfeiffer et al., to serve as a placeholder for later
    # empirical calibration (see parameters_and_options module) and to
    # reduce the numerical complexity of the optimization process (which may
    # converge towards multiple optima)
    listBasicQ = scipy.io.loadmat(
        path_precalc_inp + 'calibratedUtility_q0.mat'
        )["calibratedUtility_q0"].squeeze()
    # listBasicQ = 4

    # Coefficient for spatial autocorrelation (not used)
    listRho = 0

    # Target utility levels used to define the amenity score: we take levels
    # close to what we expect in equilibrium
    # utilityTarget = np.array([400, 900, 5400, 26000])
    utilityTarget = np.array([1200, 4800, 16000, 77000])

    # Then, we only allow utilities for the two richest income groups to
    # vary, for the sake of numerical simplicity and as they will drive most
    # of the changes in absolute values. Actually, we even exclude the
    # poorest income group from the analysis, again for the sake of
    # numerical simplicity and as it will in practice be crowded out of the
    # formal private sector (which drives most of our identification).

    # Compared to Pfeiffer et al., we only fit the data moment on exogenous
    # amenities when pinning down the values of beta and q0 (see definition
    # of the composite log-likelihood in the estimate_parameters modules).
    # This is because the amenity score is the only parameter left to
    # calibrate, and we want it to be predicted as accurately as possible by
    # our explanatory covariates. Consequently, the selected values for the
    # utility levels will not necessarily match the endogenous values we get
    # as an equilibrium outcome: this is because they are selected so as to
    # minimize the component of the theoretical amenity score that is left
    # unexplained by the chosen exogenous covariates. Indeed, we will keep
    # the explained component for the definition of the amenity score used
    # in our simulations.

    # Alternatively, we could obtain this score from model inversion (as we
    # do later with the disamenity factor from living in informal backyards
    # / settlements), implicitly defining it as a residual that allows the
    # model to fit the data better. In more intuitive terms, this boils down
    # to optimizing over the value of the amenity score at the same time as
    # the utility levels when solving the equilibrium (remember that we
    # consider a closed-city model where total population per income group
    # is exogenous and utility levels are endogenous). This approach is
    # standard in quantitative spatial economic models (see Redding and
    # Rossi-Hansberg, 2017 for more details). However, we choose to model
    # the score explicitly, defining it a priori as a calibrated parameter,
    # and only solving for utility levels (and disamenity factors) in
    # equilibrium.
    # NB: The approach taken in Pfeiffer et al., where the score is
    # calibrated a priori, but along other data moments, can be seen as an
    # intermediate between the two approaches presented above.

    # This approach comes with the risk of a poorer overall fit. However, if
    # validation shows that model predictions are good enough for use in
    # spite of this limitation, the approach also brings clear benefits.
    # First, it prevents the model from overfitting, which increases its
    # external validity when assessing counterfactuals. Then, the amenity
    # score has a clear empirical interpretation, which allows us to assess
    # more specifically policies directed towards amenities (translating
    # into a change of values for some exogenous covariates, which can be
    # directly mapped to a change in equilibrium outcomes).
    listVariation = np.arange(0.5, 1.5, 0.01)
    initUti2 = utilityTarget[1]
    listUti3 = utilityTarget[2] * listVariation
    listUti4 = utilityTarget[3] * listVariation

    # We define our equation on formal rents: see technical documentation
    # for math formulas
    if options["correct_kappa"] == 1 and options["deprec_land"] == 1:
        dataRent = (
            data_sp["price"] ** (coeff_a)
            * (param["depreciation_rate"] + interest_rate)
            / (coeffKappa * coeff_b ** coeff_b * coeff_a ** coeff_a)
            )
    elif options["correct_kappa"] == 1 and options["deprec_land"] == 0:
        dataRent = (
            (interest_rate * data_sp["price"]) ** coeff_a
            * (param["depreciation_rate"] + interest_rate) ** coeff_b
            / (coeffKappa * coeff_b ** coeff_b * coeff_a ** coeff_a)
            )
    elif options["correct_kappa"] == 0 and options["deprec_land"] == 1:
        dataRent = (
            data_sp["price"] ** (coeff_a)
            * (param["depreciation_rate"] + interest_rate)
            / (coeffKappa * coeff_b ** coeff_b)
            )
    elif options["correct_kappa"] == 0 and options["deprec_land"] == 0:
        dataRent = (
            (data_sp["price"] * interest_rate) ** coeff_a
            * (param["depreciation_rate"] + interest_rate) ** coeff_b
            / (coeffKappa * coeff_b ** coeff_b)
            )

    # Note that the above formulas consider that we observe land prices in
    # the data, hence the conversion to housing rents through the zero
    # profit condition for formal private developers. If we observe housing
    # prices instead, we should use the formula below, expressing rents as
    # the annual coupon associated with an infinite bond valued at the
    # housing price
    dataRent = data_sp["price"] * interest_rate

    # We import amenity data at the SP level
    amenities_sp = calam.import_exog_amenities(
        path_data, path_precalc_inp, 'SP')
    # We select amenity variables to be used in regressions
    # NB: the choice has to do with the relevance and exogeneity of the
    # variables. The chosen set is relevant but may only refer to a small
    # subset of locations (train stations are often dysfunctional, for
    # instance)
    variables_regression = [
        'distance_distr_parks', 'distance_ocean', 'distance_ocean_2_4',
        'distance_urban_herit', 'airport_cone2', 'slope_1_5', 'slope_5',
        'distance_biosphere_reserve', 'distance_train']

    # We run the parameter scanning.
    # Note that this may be long to run as it depends on the combination of
    # all inputs: we need to make sure that the estimated parameters fall
    # within the predefined value ranges to exclude corner solutions.
    (parametersScan, scoreScan, parametersAmenitiesScan, modelAmenityScan,
     parametersHousing) = calscan.EstimateParametersByScanning(
        incomeNetOfCommuting, dataRent, data_sp["dwelling_size"],
        data_income_group, selectedSP, amenities_sp, variables_regression,
        listRho, listBeta, listBasicQ, initUti2, listUti3, listUti4, options)
    print(modelAmenityScan.summary())

    # Now we run the optimization algorithm with the identified values of
    # the parameters: this corresponds to an interior-point algorithm.
    # Note that this requires the above scanning process to be well-defined
    # in order to converge to an interior solution: we provide the option of
    # not running it in case this leads to absurd results.
    if options["param_optim"] == 1:
        initBeta = parametersScan[0]
        initBasicQ = parametersScan[1]
        initUti3 = parametersScan[2]
        initUti4 = parametersScan[3]

        (parameters, scoreTot, parametersAmenities, modelAmenity,
         parametersHousing) = calopt.EstimateParametersByOptimization(
            incomeNetOfCommuting, dataRent, data_sp["dwelling_size"],
            data_income_group, selectedSP, amenities_sp,
            variables_regression, listRho, initBeta, initBasicQ, initUti2,
            initUti3, initUti4, options)
        print(modelAmenity.summary())

    # Exporting and saving outputs
    amenities_grid = calam.import_exog_amenities(
        path_data, path_precalc_inp, 'grid')
    predictors_grid = amenities_grid.loc[:, variables_regression]
    predictors_grid = sm.add_constant(predictors_grid)
    # predictors_grid = np.vstack(
    #     [np.ones(predictors_grid.shape[0]),
    #      predictors_grid.T]
    #     ).T

    # NB: we only retain the explained component of the theoretical amenity
    # score
    if options["param_optim"] == 1:
        cal_amenities = np.exp(
            np.nansum(predictors_grid * parametersAmenities, 1))
        calibratedUtility_beta = parameters[0]
        calibratedUtility_q0 = parameters[1]
        calibratedUtility_u3 = parameters[2]
        calibratedUtility_u4 = parameters[3]
    elif options["param_optim"] == 0:
        cal_amenities = np.exp(
            np.nansum(predictors_grid * parametersAmenitiesScan, 1))
        calibratedUtility_beta = parametersScan[0]
        calibratedUtility_q0 = parametersScan[1]
        calibratedUtility_u3 = parametersScan[2]
        calibratedUtility_u4 = parametersScan[3]

    np.save(path_precalc_inp + 'calibratedUtility_beta',
            calibratedUtility_beta)
    np.save(path_precalc_inp + 'calibratedUtility_q0', calibratedUtility_q0)
    np.save(path_precalc_inp + 'calibratedAmenities', cal_amenities)

    return (calibratedUtility_beta, calibratedUtility_q0, cal_amenities,
            calibratedUtility_u3, calibratedUtility_u4)
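
# Illustrative sketch (not part of the original pipeline): the calibrated
# amenity index above is the exponential of the linear prediction from the
# amenity regression, keeping only the component explained by the selected
# exogenous covariates. The helper below reproduces that last step on
# hypothetical inputs (a covariate table and a coefficient vector with the
# constant first), mirroring the sm.add_constant / np.nansum pattern used
# above; the function name and inputs are hypothetical.
def _example_amenity_score(covariates_df, coefficients):
    # covariates_df: DataFrame of exogenous amenity covariates per location
    # coefficients: fitted regression parameters, constant first
    predictors = sm.add_constant(covariates_df)
    return np.exp(np.nansum(predictors * coefficients, 1))

# Hypothetical usage (assuming pandas imported as pd):
#     covariates = pd.DataFrame({"distance_ocean": [1.2, 5.0],
#                                "slope_5": [0.0, 1.0]})
#     scores = _example_amenity_score(covariates,
#                                     np.array([0.1, -0.02, -0.3]))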