Source code for calibration.calib_main_func

# -*- coding: utf-8 -*-

import numpy as np
import numpy.matlib
import statsmodels.api as sm
import os
import math
import scipy.io

import calibration.sub.compute_income as calcmp
import calibration.sub.import_employment_data as calemp
import calibration.sub.estimate_parameters_by_scanning as calscan
import calibration.sub.estimate_parameters_by_optimization as calopt
import calibration.sub.import_amenities as calam


def estim_construct_func_param(options, param, data_sp,
                               threshold_income_distribution,
                               income_distribution, data_rdp,
                               housing_types_sp, data_number_formal,
                               data_income_group, selected_density,
                               path_data, path_precalc_inp, path_folder):
    """
    Estimate coefficients of housing production function (Cobb-Douglas).

    This function leverages a partial relation from our general equilibrium
    model, which is estimated on SP data that does not enter the simulations
    as an input. More precisely, it combines the expression of the optimal
    housing supply in the formal private sector with the highest bid
    condition (see technical documentation for math formulas).

    Parameters
    ----------
    options : dict
        Dictionary of default options
    param : dict
        Dictionary of default parameters
    data_sp : DataFrame
        Table yielding, for each Small Place (1,046), the average dwelling
        size (in m²), the average land price and annual income level (in
        rands), the size of unconstrained area for construction (in m²),
        the total area (in km²), the distance to the city centre (in km),
        whether or not the location belongs to Mitchells Plain, and the SP
        code
    threshold_income_distribution : ndarray(int32)
        Annual income level (in rands) above which a household is taken as
        being part of one of the 4 income groups in the model
    income_distribution : ndarray(uint16, ndim=2)
        Exogenous number of households in each Small Place (1,046) for each
        income group in the model (4)
    data_rdp : DataFrame
        Table yielding, for each grid cell (24,014), the associated
        cumulative count of cells with some formal subsidized housing, and
        the associated area (in m²) dedicated to such housing
    housing_types_sp : DataFrame
        Table yielding, for each Small Place (1,046), the number of informal
        backyards, of informal settlements, and total dwelling units, as
        well as their (centroid) x and y coordinates
    data_number_formal : Series
        Number of formal private housing units considered for each Small
        Place (1,046)
    data_income_group : ndarray(float64)
        Categorical variable indicating, for each Small Place (1,046), the
        dominant income group (from 0 to 3)
    selected_density : Series
        Dummy variable allowing for sample selection across Small Places
        (1,046) for regressions that are only valid in the formal private
        housing sector
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    path_folder : str
        Path towards the root data folder

    Returns
    -------
    coeff_b : float64
        Calibrated capital elasticity in housing production function
    coeff_a : float64
        Calibrated land elasticity in housing production function
    coeffKappa : float64
        Calibrated scale factor in housing production function

    """
    # We define our outcome variable
    y = np.log(data_number_formal[selected_density])

    # We define our independent variables
    # Note that we use data_sp["unconstrained_area"] (which is accurate data
    # at SP level) rather than coeff_land (which is an estimate at grid
    # level)
    X = np.transpose(
        np.array([np.ones(len(data_sp["price"][selected_density])),
                  np.log(data_sp["price"][selected_density]),
                  np.log(data_sp["dwelling_size"][selected_density]),
                  np.log(param["max_land_use"]
                         * data_sp["unconstrained_area"][selected_density])])
        )
    # NB: Our data set for dwelling sizes only provides the average dwelling
    # size at the Sub-Place level, aggregating formal and informal housing

    # We run the linear regression
    modelSpecification = sm.OLS(y, X, missing='drop')
    model_construction = modelSpecification.fit()
    print(model_construction.summary())
    parametersConstruction = model_construction.params

    # We export outputs of the model
    coeff_b = parametersConstruction["x1"]
    coeff_a = 1 - coeff_b
    # Scale factor formula comes from the zero profit condition combined
    # with footnote 16 from Pfeiffer et al. (typo in original paper)
    if options["correct_kappa"] == 1:
        coeffKappa = ((1 / (coeff_b / coeff_a) ** coeff_b)
                      * np.exp(parametersConstruction["const"]))
    elif options["correct_kappa"] == 0:
        coeffKappa = ((1 / (coeff_b) ** coeff_b)
                      * np.exp(parametersConstruction["const"]))

    try:
        os.mkdir(path_precalc_inp)
    except OSError as error:
        print(error)

    np.save(path_precalc_inp + 'calibratedHousing_b.npy', coeff_b)
    np.save(path_precalc_inp + 'calibratedHousing_kappa.npy', coeffKappa)

    return coeff_b, coeff_a, coeffKappa
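
# Illustrative sketch (not part of the original pipeline): the estimation
# above regresses log formal housing units on a constant, log land price,
# log dwelling size and log available land, and reads the capital elasticity
# coeff_b off the log-price coefficient. The helper below reproduces that
# regression layout on synthetic data; the data-generating process, the
# chosen coefficient values and the function name are hypothetical, only the
# layout mirrors the code above.
def _example_construct_func_regression(seed=0):
    rng = np.random.default_rng(seed)
    n = 500
    # Hypothetical log-normal covariates standing in for SP-level data
    price = np.exp(rng.normal(8.0, 0.5, n))
    dwelling_size = np.exp(rng.normal(4.0, 0.3, n))
    land = np.exp(rng.normal(12.0, 0.4, n))
    b_true = 0.6
    # Assumed log-linear outcome in which the log-price coefficient is the
    # capital elasticity, as in the estimation above
    y = (0.5 + b_true * np.log(price) - np.log(dwelling_size)
         + np.log(land) + rng.normal(0, 0.1, n))
    X = np.column_stack([np.ones(n), np.log(price),
                         np.log(dwelling_size), np.log(land)])
    # Returns an estimate close to b_true (here, 0.6)
    return sm.OLS(y, X).fit().params[1]
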

def estim_incomes_and_gravity(param, grid, list_lambda,
                              households_per_income_class, average_income,
                              income_distribution, spline_inflation,
                              spline_fuel,
                              spline_population_income_distribution,
                              spline_income_distribution,
                              path_data, path_precalc_inp,
                              path_precalc_transp, options):
    """
    Estimate incomes per job center and income group, with gravity parameter.

    This function leverages theoretical formulas from
    calibration.sub.compute_income and data imported through the
    calibration.sub.import_employment_data module. It first imports
    transport costs and the observed number of commuters per selected job
    centre and income group, then estimates the associated incomes for a
    given gravity parameter by minimizing the error over the simulated
    number of commuters. Then, it selects among a list of scanned values the
    final value of the gravity parameter (and the associated incomes) by
    minimizing the error over the distribution of commuters along their
    residence-workplace distances.

    Parameters
    ----------
    param : dict
        Dictionary of default parameters
    grid : DataFrame
        Table yielding, for each grid cell (24,014), its x and y (centroid)
        coordinates, and its distance (in km) to the city centre
    list_lambda : ndarray(float64)
        List of values over which to scan for the gravity parameter used in
        the commuting choice model
    households_per_income_class : ndarray(float64)
        Exogenous total number of households per income group (excluding
        people out of employment, for 4 groups)
    average_income : ndarray(float64)
        Average median income for each income group in the model (4)
    income_distribution : ndarray(uint16, ndim=2)
        Exogenous number of households in each Small Place (1,046) for each
        income group in the model (4)
    spline_inflation : interp1d
        Linear interpolation for inflation rate (in base 100 relative to
        baseline year) over the years (baseline year set at 0)
    spline_fuel : interp1d
        Linear interpolation for fuel price (in rands per km) over the years
        (baseline year set at 0)
    spline_population_income_distribution : interp1d
        Linear interpolation for total population per income group in the
        data (12) over the years (baseline year set at 0)
    spline_income_distribution : interp1d
        Linear interpolation for median annual income (in rands) per income
        group in the data (12) over the years (baseline year set at 0)
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    path_precalc_transp : str
        Path for precalculated transport inputs (intermediate outputs from
        commuting choice model)
    options : dict
        Dictionary of default options

    Returns
    -------
    incomeCentersKeep : ndarray(float64, ndim=2)
        Calibrated average annual household income (including unemployment)
        for each income group (4), per grid cell (24,014)
    lambdaKeep : float64
        Calibrated gravity parameter from the commuting choice model
    cal_avg_income : ndarray(float64)
        Overall calibrated average income across income groups (4), for
        validation only
    scoreKeep : ndarray(float64)
        Mean ratio of simulated over observed number of commuters per job
        center (185), collapsed at the income-group level (4): captures the
        error over our calibrated parameters
    bhattacharyyaDistances : ndarray(float64)
        Bhattacharyya distances (measuring the similarity of two probability
        distributions) between the calculated distribution of commuting
        distances and aggregates from the Transport Survey, for each scanned
        value of the gravity parameter. This is used as an auxiliary measure
        to pin down a unique gravity parameter (and associated matrix of
        incomes), and is only given as an output of the function for
        reference.

    """
    # We import the number of workers in each selected job center.
    # Note that it is rescaled to match the aggregate income distribution
    # in the census
    job_centers = calemp.import_employment_data(
        households_per_income_class, param, path_data)

    # We import transport cost data.
    # Note that we reason at the SP level here. Also note that we are
    # considering round trips and households made up of two people.
    (timeOutput, distanceOutput, monetaryCost, costTime
     ) = calcmp.import_transport_costs(
        grid, param, 0, households_per_income_class, spline_inflation,
        spline_fuel, spline_population_income_distribution,
        spline_income_distribution, path_precalc_inp, path_precalc_transp,
        'SP', options)

    # Note that this is long to run.
    # Here again, we are considering rescaled income data.
    (incomeCenters, distanceDistribution, scoreMatrix
     ) = calcmp.EstimateIncome(
        param, timeOutput, distanceOutput[:, :, 0], monetaryCost, costTime,
        job_centers, average_income, income_distribution, list_lambda,
        options)

    # Gives aggregate statistics for % of commuters per distance bracket
    # NB: bracketsDistance = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 200])
    # with floor residence-workplace distances in km
    # (see calibration.sub.compute_income)
    # NB: we could check differential mobility patterns as a robustness
    # check by estimating a distinct gravity parameter for each income group
    # if we get access to a breakdown of the residence-workplace distance
    # distribution per income group

    # Residence-workplace distance distribution comes from the CoCT's 2013
    # Transport Survey
    data_distance_distribution = np.array(
        [45.6174222, 18.9010734, 14.9972971, 9.6725616, 5.9425438,
         2.5368754, 0.9267125, 0.3591011, 1.0464129])

    # Compute accessibility index
    # NB1: Bhattacharyya distance measures the similarity of two probability
    # distributions (here, data vs. simulated % of commuters)
    # NB2: Mahalanobis distance is a particular case of the Bhattacharyya
    # distance when the standard deviations of the two classes are the same
    bhattacharyyaDistances = (
        - np.log(np.nansum(np.sqrt(data_distance_distribution[:, None]
                                   / 100 * distanceDistribution), 0)))
    whichLambda = np.argmin(bhattacharyyaDistances)

    # Hence, we keep the lambda that minimizes the distance and the
    # associated income vector
    lambdaKeep = list_lambda[whichLambda]
    incomeCentersKeep = incomeCenters[:, :, whichLambda]

    # We also keep the associated error metric
    # scoreKeep = scoreMatrix[whichLambda, :]
    scoreKeep = scoreMatrix

    # Note that income is set to -inf for job centers and income groups in
    # which it could not be calibrated
    np.save(path_precalc_inp + 'incomeCentersKeep.npy', incomeCentersKeep)
    np.save(path_precalc_inp + 'lambdaKeep.npy', lambdaKeep)

    # Note that it is unclear whether "average" income from the data
    # includes unemployment or not: a priori, it does for short spells (less
    # than one year) and should therefore be slightly bigger than calibrated
    # income (which should reflect all unemployment): this is what we
    # observe in practice
    incomeCentersKeep[incomeCentersKeep < 0] = math.nan
    cal_avg_income = np.nanmean(incomeCentersKeep, 0)

    return (incomeCentersKeep, lambdaKeep, cal_avg_income, scoreKeep,
            bhattacharyyaDistances)
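
# Illustrative sketch (not part of the original pipeline): the gravity
# parameter is selected by minimizing the Bhattacharyya distance between the
# observed and simulated shares of commuters per distance bracket, computed
# above as -log(sum(sqrt(p * q))). The helper below applies the same formula
# to two toy one-dimensional distributions; the function name and inputs are
# hypothetical. Identical distributions give a distance of 0, e.g.
# _example_bhattacharyya_distance([50, 30, 20], [0.5, 0.3, 0.2]) == 0.0.
def _example_bhattacharyya_distance(observed_pct, simulated_share):
    # observed_pct: data distribution in % (as data_distance_distribution)
    # simulated_share: simulated shares summing to 1 (as one column of
    # distanceDistribution)
    observed = np.array(observed_pct, dtype=float) / 100
    simulated = np.array(simulated_share, dtype=float)
    return - np.log(np.nansum(np.sqrt(observed * simulated)))
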

def estim_util_func_param(data_number_formal, data_income_group,
                          housing_types_sp, data_sp, coeff_a, coeff_b,
                          coeffKappa, interest_rate, incomeNetOfCommuting,
                          path_data, path_precalc_inp, options, param):
    """
    Calibrate utility function parameters.

    This function leverages the following modules: import_amenities,
    estimate_parameters_by_scanning, and
    estimate_parameters_by_optimization. As before, we use partial relations
    coming from our general equilibrium structure (see technical
    documentation for math formulas). This time, we look at the utility
    function parameters that maximize a composite likelihood function for
    the fit on observed amenities, dwelling sizes, and population sorting by
    income (see calibration.sub.loglikelihood module). We proceed first by
    scanning over a discrete range of parameter values, then by running a
    smooth solver taking outputs from scanning as initial values.

    Parameters
    ----------
    data_number_formal : Series
        Number of formal private housing units considered for each Small
        Place (1,046)
    data_income_group : ndarray(float64)
        Categorical variable indicating, for each Small Place (1,046), the
        dominant income group (from 0 to 3)
    housing_types_sp : DataFrame
        Table yielding, for each Small Place (1,046), the number of informal
        backyards, of informal settlements, and total dwelling units, as
        well as their (centroid) x and y coordinates
    data_sp : DataFrame
        Table yielding, for each Small Place (1,046), the average dwelling
        size (in m²), the average land price and annual income level (in
        rands), the size of unconstrained area for construction (in m²),
        the total area (in km²), the distance to the city centre (in km),
        whether or not the location belongs to Mitchells Plain, and the SP
        code
    coeff_a : float64
        Calibrated land elasticity in housing production function
    coeff_b : float64
        Calibrated capital elasticity in housing production function
    coeffKappa : float64
        Calibrated scale factor in housing production function
    interest_rate : float64
        Interest rate for the overall economy, corresponding to an average
        over past years
    incomeNetOfCommuting : ndarray(float64, ndim=2)
        Expected annual income net of commuting costs (in rands, for one
        household), for each geographic unit, by income group (4)
    path_data : str
        Path towards data used in the model
    path_precalc_inp : str
        Path for precalculated input data (calibrated parameters)
    options : dict
        Dictionary of default options
    param : dict
        Dictionary of default parameters

    Returns
    -------
    calibratedUtility_beta : float64
        Calibrated surplus housing elasticity in households' utility
        function
    calibratedUtility_q0 : float64
        Parametric basic need in housing (in m²). Note that this parameter
        is not an output of the calibration per se, as it is exogenously
        set. It is included here for reference, as it enters the households'
        utility function and enters the optimization programme as an input.
        Note that this could be optimized over (as in Pfeiffer et al.), but
        only within a narrow range of values to preserve feasibility of
        allocations.
    cal_amenities : ndarray(float64)
        Calibrated amenity index for each grid cell (24,014): this is not
        normalized yet.

    """
    # We select in which areas we actually measure the likelihood.
    # We use a less stringent definition than for the fit on building
    # density (estimation of construction function parameters), as we are
    # going to separately add conditions more specific to each regression
    # used in the process.
    # NB: There is a trade-off between empirical validity and statistical
    # power
    selectedSP = (
        (data_number_formal > 0.90 * housing_types_sp.total_dwellings_SP_2011)
        & (data_income_group > 0)
        )

    # We are going to scan over values on which to optimize
    # As explained in the parameters_and_options module, we consider that
    # the benchmark value from Finlay and Williams (2022) is more robust
    # than our own estimation for the elasticity of housing demand. We
    # therefore pin this value down as default. Note that we can define a
    # range of acceptable values for sensitivity checks.
    listBeta = scipy.io.loadmat(
        path_precalc_inp + 'calibratedUtility_beta.mat'
        )["calibratedUtility_beta"].squeeze()
    # listBeta = 0.25
    # listBeta = np.arange(0.1, 0.41, 0.02)

    # Again, we pin the value of the basic need in housing to the one
    # estimated in Pfeiffer et al., to serve as a placeholder for later
    # empirical calibration (see parameters_and_options module) and to
    # reduce the numerical complexity of the optimization process (which may
    # converge towards multiple optima)
    listBasicQ = scipy.io.loadmat(
        path_precalc_inp + 'calibratedUtility_q0.mat'
        )["calibratedUtility_q0"].squeeze()
    # listBasicQ = 4

    # Coefficient for spatial autocorrelation (not used)
    listRho = 0

    # Target utility levels used to define the amenity score: we take levels
    # close to what we expect in equilibrium
    # utilityTarget = np.array([400, 900, 5400, 26000])
    utilityTarget = np.array([1200, 4800, 16000, 77000])

    # Then, we only allow utilities for the two richest income groups to
    # vary, for the sake of numerical simplicity and as they will drive most
    # of the changes in absolute values. Actually, we even exclude the
    # poorest income group from the analysis, again for the sake of
    # numerical simplicity and as it will in practice be crowded out of the
    # formal private sector (which drives most of our identification).

    # Compared to Pfeiffer et al., we only fit the data moment on exogenous
    # amenities when pinning down the values of beta and q0 (see definition
    # of the composite log-likelihood in the estimate_parameters modules).
    # This is because the amenity score is the only parameter left to
    # calibrate, and we want it to be predicted as accurately as possible by
    # our explanatory covariates. Consequently, the selected values for the
    # utility levels will not necessarily match the endogenous values we get
    # as an equilibrium outcome: this is because they are selected so as to
    # minimize the component of the theoretical amenity score that is left
    # unexplained by the chosen exogenous covariates. Indeed, we will keep
    # the explained component for the definition of the amenity score used
    # in our simulations.

    # Alternatively, we could obtain this score from model inversion (as we
    # do later with the disamenity factor from living in informal backyards
    # / settlements), implicitly defining it as a residual that allows the
    # model to fit the data better. In more intuitive terms, this boils down
    # to optimizing over the value of the amenity score at the same time as
    # the utility levels when solving the equilibrium (remember that we
    # consider a closed-city model where total population per income group
    # is exogenous and utility levels are endogenous). This approach is
    # standard in quantitative spatial economic models (see Redding and
    # Rossi-Hansberg, 2017 for more details). However, we choose to model
    # the score explicitly, defining it a priori as a calibrated parameter,
    # and only solving for utility levels (and disamenity factors) in
    # equilibrium.
    # NB: The approach taken in Pfeiffer et al., where the score is
    # calibrated a priori, but along other data moments, can be seen as an
    # intermediate between the two approaches presented above.

    # This approach comes with the risk of a poorer overall fit. However, if
    # validation shows that model predictions are good enough for use in
    # spite of this limitation, the approach also brings clear benefits.
    # First, it prevents the model from overfitting, which increases its
    # external validity when assessing counterfactuals. Then, the amenity
    # score has a clear empirical interpretation, which allows us to assess
    # more specifically policies directed towards amenities (translating
    # into a change of values for some exogenous covariates, which can be
    # directly mapped to a change in equilibrium outcomes).
    listVariation = np.arange(0.5, 1.5, 0.01)
    initUti2 = utilityTarget[1]
    listUti3 = utilityTarget[2] * listVariation
    listUti4 = utilityTarget[3] * listVariation

    # We define our equation on formal rents: see technical documentation
    # for math formulas
    if options["correct_kappa"] == 1 and options["deprec_land"] == 1:
        dataRent = (
            data_sp["price"] ** (coeff_a)
            * (param["depreciation_rate"] + interest_rate)
            / (coeffKappa * coeff_b ** coeff_b * coeff_a ** coeff_a)
            )
    elif options["correct_kappa"] == 1 and options["deprec_land"] == 0:
        dataRent = (
            (interest_rate * data_sp["price"]) ** coeff_a
            * (param["depreciation_rate"] + interest_rate) ** coeff_b
            / (coeffKappa * coeff_b ** coeff_b * coeff_a ** coeff_a)
            )
    elif options["correct_kappa"] == 0 and options["deprec_land"] == 1:
        dataRent = (
            data_sp["price"] ** (coeff_a)
            * (param["depreciation_rate"] + interest_rate)
            / (coeffKappa * coeff_b ** coeff_b)
            )
    elif options["correct_kappa"] == 0 and options["deprec_land"] == 0:
        dataRent = (
            (data_sp["price"] * interest_rate) ** coeff_a
            * (param["depreciation_rate"] + interest_rate) ** coeff_b
            / (coeffKappa * coeff_b ** coeff_b)
            )

    # Note that the above formulas consider that we observe land prices in
    # the data, hence the conversion to housing rents through the zero
    # profit condition for formal private developers. If we observe housing
    # prices instead, we should use the formula below, expressing rents as
    # the annual coupon associated with an infinite bond valued at the
    # housing price
    dataRent = data_sp["price"] * interest_rate

    # We import amenity data at the SP level
    amenities_sp = calam.import_exog_amenities(
        path_data, path_precalc_inp, 'SP')
    # We select amenity variables to be used in regressions
    # NB: the choice has to do with the relevance and exogeneity of the
    # variables. The chosen set is relevant but may only refer to a small
    # subset of locations (train stations are often dysfunctional, for
    # instance)
    variables_regression = [
        'distance_distr_parks', 'distance_ocean', 'distance_ocean_2_4',
        'distance_urban_herit', 'airport_cone2', 'slope_1_5', 'slope_5',
        'distance_biosphere_reserve', 'distance_train']

    # We run the parameter scanning.
    # Note that this may be long to run as it depends on the combination of
    # all inputs: we need to make sure that the estimated parameters fall
    # within the predefined value ranges to exclude corner solutions.
    (parametersScan, scoreScan, parametersAmenitiesScan, modelAmenityScan,
     parametersHousing) = calscan.EstimateParametersByScanning(
        incomeNetOfCommuting, dataRent, data_sp["dwelling_size"],
        data_income_group, selectedSP, amenities_sp, variables_regression,
        listRho, listBeta, listBasicQ, initUti2, listUti3, listUti4, options)
    print(modelAmenityScan.summary())

    # Now we run the optimization algorithm with the identified values of
    # the parameters: this corresponds to an interior-point algorithm.
    # Note that this requires the above scanning process to be well-defined
    # in order to converge to an interior solution: we provide the option of
    # not running it in case this leads to absurd results.
    if options["param_optim"] == 1:
        initBeta = parametersScan[0]
        initBasicQ = parametersScan[1]
        initUti3 = parametersScan[2]
        initUti4 = parametersScan[3]

        (parameters, scoreTot, parametersAmenities, modelAmenity,
         parametersHousing) = calopt.EstimateParametersByOptimization(
            incomeNetOfCommuting, dataRent, data_sp["dwelling_size"],
            data_income_group, selectedSP, amenities_sp,
            variables_regression, listRho, initBeta, initBasicQ, initUti2,
            initUti3, initUti4, options)
        print(modelAmenity.summary())

    # Exporting and saving outputs
    amenities_grid = calam.import_exog_amenities(
        path_data, path_precalc_inp, 'grid')
    predictors_grid = amenities_grid.loc[:, variables_regression]
    predictors_grid = sm.add_constant(predictors_grid)
    # predictors_grid = np.vstack(
    #     [np.ones(predictors_grid.shape[0]),
    #      predictors_grid.T]
    #     ).T

    # NB: we only retain the explained component of the theoretical amenity
    # score
    if options["param_optim"] == 1:
        cal_amenities = np.exp(
            np.nansum(predictors_grid * parametersAmenities, 1))
        calibratedUtility_beta = parameters[0]
        calibratedUtility_q0 = parameters[1]
        calibratedUtility_u3 = parameters[2]
        calibratedUtility_u4 = parameters[3]
    elif options["param_optim"] == 0:
        cal_amenities = np.exp(
            np.nansum(predictors_grid * parametersAmenitiesScan, 1))
        calibratedUtility_beta = parametersScan[0]
        calibratedUtility_q0 = parametersScan[1]
        calibratedUtility_u3 = parametersScan[2]
        calibratedUtility_u4 = parametersScan[3]

    np.save(path_precalc_inp + 'calibratedUtility_beta',
            calibratedUtility_beta)
    np.save(path_precalc_inp + 'calibratedUtility_q0', calibratedUtility_q0)
    np.save(path_precalc_inp + 'calibratedAmenities', cal_amenities)

    return (calibratedUtility_beta, calibratedUtility_q0, cal_amenities,
            calibratedUtility_u3, calibratedUtility_u4)
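
# Illustrative sketch (not part of the original pipeline): the calibrated
# amenity index above is the exponential of the linear prediction from the
# amenity regression, keeping only the component explained by the selected
# exogenous covariates. The helper below reproduces that last step on
# hypothetical inputs (a covariate table and a coefficient vector with the
# constant first), mirroring the sm.add_constant / np.nansum pattern used
# above; the function name and inputs are hypothetical.
def _example_amenity_score(covariates_df, coefficients):
    # covariates_df: DataFrame of exogenous amenity covariates per location
    # coefficients: fitted regression parameters, constant first
    predictors = sm.add_constant(covariates_df)
    return np.exp(np.nansum(predictors * coefficients, 1))

# Hypothetical usage (assuming pandas imported as pd):
#     covariates = pd.DataFrame({"distance_ocean": [1.2, 5.0],
#                                "slope_5": [0.0, 1.0]})
#     scores = _example_amenity_score(covariates,
#                                     np.array([0.1, -0.02, -0.3]))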