Source code for calibration.sub.loglikelihood

# -*- coding: utf-8 -*-
"""



"""

import numpy as np
import statsmodels.api as sm


def LogLikelihoodModel(X0, Uo2, net_income, groupLivingSpMatrix,
                       dataDwellingSize, selectedDwellingSize, dataRent,
                       selectedRents, predictorsAmenitiesMatrix,
                       tableRegression, variables_regression,
                       CalculateDwellingSize, ComputeLogLikelihood,
                       optionRegression, options):
    """
    Compute composite log-likelihood for model fit given scanned parameters.

    This function computes three separate log-likelihoods that capture the
    fit of the model along distinct data moments, then sums them into a
    composite log-likelihood that is maximized to calibrate the utility
    function parameters (as part of a scanning or smooth optimization
    process). More precisely, it starts by regressing theoretical values of
    the log amenity index on observed exogenous dummy amenity variables.
    The first log-likelihood is computed from the value of the residuals.
    It then defines a second log-likelihood for the fit on income sorting
    (matching the observed dominant income group with the highest-bidding
    income group). Finally, it takes the residuals from the log-difference
    between theoretical and observed dwelling sizes, and computes the third
    log-likelihood from there.

    Parameters
    ----------
    X0 : ndarray(float64)
        Set of inputs (namely, surplus housing elasticity, basic need in
        housing, and utility levels for income groups 3 and 4) tested for a
        given iteration
    Uo2 : int32
        Target utility level for income group 2
    net_income : ndarray(float64, ndim=2)
        Expected annual income net of commuting costs (in rands, for one
        household), for each SP (1,046), by income group excluding the
        poorest (3)
    groupLivingSpMatrix : ndarray(bool, ndim=2)
        Dummy variable indicating, for each income group excluding the
        poorest (3), whether it is dominant in each SP (1,046)
    dataDwellingSize : Series
        Average dwelling size (in m²) for each SP (1,046), from SP data
    selectedDwellingSize : Series
        Dummy variable indicating, for each SP (1,046), whether it is
        selected into the sample used when regressing observed dwelling
        sizes on their theoretical equivalent
    dataRent : Series
        Theoretical average annual rent for formal private housing,
        computed from data on average land prices, for each SP (1,046)
    selectedRents : Series
        Dummy variable indicating, for each SP (1,046), whether it is
        selected into the sample used when estimating the discrete-choice
        logit model associated with income sorting (identifying observed
        rents with the highest bid-rents from the dominant income group)
    predictorsAmenitiesMatrix : ndarray(float64, ndim=2)
        Values of selected exogenous dummy amenity variables (10, including
        the intercept) in each selected SP (according to selectedRents)
    tableRegression : DataFrame
        Values of all exogenous dummy amenity variables (16, excluding the
        intercept) in each selected SP (according to selectedRents)
    variables_regression : list
        List of labels for selected exogenous dummy amenity variables (9,
        excluding the intercept)
    CalculateDwellingSize : function
        Function defining the relationship between rents and dwelling sizes
        in the formal private sector (see technical documentation)
    ComputeLogLikelihood : function
        Log-likelihood function for a lognormal law of mean 0: we assume
        that dwelling size and amenity residuals follow such a law
    optionRegression : int
        Option to run a GLM (instead of OLS) regression for the estimation
        of exogenous amenity coefficients: the default is set to zero, as
        the GLM is unstable
    options : dict
        Dictionary of default options

    Returns
    -------
    scoreTotal : float64
        Value of the composite log-likelihood for the set of parameters
        scanned
    scoreAmenities : float64
        Value of the log-likelihood for the fit on exogenous amenities
    scoreDwellingSize : float64
        Value of the log-likelihood for the fit on observed dwelling sizes
    scoreIncomeSorting : float64
        Value of the log-likelihood for the fit on observed income sorting
        (matching observed rents to the highest bid-rents from the dominant
        income group)
    scoreHousing : float64
        Value of the log-likelihood for the fit on observed housing supply
        / building density: this is not used in this version of the model,
        as the relation is already exploited for the calibration of
        construction function parameters (hence it is set equal to zero)
    parametersAmenities : ndarray(float64)
        List of estimates for the impact of exogenous (dummy) amenities on
        the calibrated amenity index (in log form). In order: distance to
        the ocean <2 km, distance to the ocean between 2 and 4 km, slope
        between 1 and 5%, slope >5%, being located within the airport cone,
        distance to district parks <2 km, distance to biosphere reserve
        <2 km, distance to train station <2 km, distance to urban heritage
        site <2 km
    modelAmenities : regression.linear_model.RegressionResultsWrapper
        Object summarizing the results of the log-regression of the
        theoretical amenity index on observed exogenous amenity dummies
    parametersHousing : int
        List of estimates related to the fit of the model on building
        density / housing supply: this is not included in this version of
        the model (the vector is set equal to zero), as we already exploit
        this relation to estimate the parameters of the construction
        function (compared to other versions where we use construction
        costs)

    """
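    # For reference, minimal sketches of the two callable arguments (these
    # are assumptions about their usual shape, kept in comments; the actual
    # definitions live in the calling calibration script):
    # CalculateDwellingSize = (
    #     lambda beta, basicQ, incomeTemp, rentTemp:
    #     beta * incomeTemp / rentTemp + (1 - beta) * basicQ)
    # ComputeLogLikelihood = (
    #     lambda sigma, error:
    #     np.nansum(- np.log(2 * np.pi * sigma ** 2) / 2
    #               - error ** 2 / (2 * sigma ** 2)))
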
    # We extract the scanned parameters from the input vector
    beta = X0[0]
    basicQ = X0[1]
    Uo = np.array([Uo2, X0[2], X0[3]])

    # %% Error on the amenity index

    # Theoretical formula for the log amenity index (from the technical
    # documentation). Note that we stay under the sample selection from
    # the selectedRents variable.
    logAmenityIndex = (
        np.log(np.array(Uo)[:, None])
        - np.log(
            (1 - beta) ** (1 - beta) * beta ** beta
            * (net_income[:, selectedRents]
               - basicQ * np.array(dataRent)[None, selectedRents])
            / (np.array(dataRent)[None, selectedRents] ** beta))
    )

    # We select values for dominant income groups only and flatten the
    # array: this allows us to pick the appropriate net income and utility
    # level to identify the regression
    logAmenityIndex = np.nansum(
        logAmenityIndex * groupLivingSpMatrix[:, selectedRents], 0)
    # logAmenityIndex[np.abs(logAmenityIndex.imag) > 0] = np.nan
    # logAmenityIndex[logAmenityIndex == -np.inf] = np.nan
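    # Sanity note (our reading of the formula above, to be checked against
    # the technical documentation): it inverts the indirect utility from
    # Stone-Geary preferences,
    # U = A * (1 - beta) ** (1 - beta) * beta ** beta
    #     * (y - basicQ * R) / R ** beta,
    # so that log(A) is recovered as log(U) minus the log of the
    # income/rent term.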
    # We get the residuals from the regression of the log amenity index on
    # exogenous dummy variables.
    # Note that we identify the dummies with the logs of the original
    # amenity values (normalized to take values between 1 and e): see the
    # technical documentation

    # OLS estimation
    if (optionRegression == 0):
        A = predictorsAmenitiesMatrix
        # A = predictorsAmenitiesMatrix[~np.isnan(logAmenityIndex), :]
        y = (logAmenityIndex[~np.isnan(logAmenityIndex)]).real
        # parametersAmenities, residuals, rank, s = np.linalg.lstsq(
        #     A, y, rcond=None)
        # res = scipy.optimize.lsq_linear(A, y)
        # parametersAmenities = res.x
        # residuals = res.fun
        modelSpecification = sm.OLS(y, A, missing='drop')
        modelAmenities = modelSpecification.fit()
        parametersAmenities = modelAmenities.params
        errorAmenities = modelAmenities.resid_pearson
        # errorAmenities = y - np.nansum(A * parametersAmenities, 1)
        # modelAmenities = 0

    # GLM estimation (unstable)
    elif (optionRegression == 1):
        residu = logAmenityIndex.real
        A = predictorsAmenitiesMatrix[~np.isnan(logAmenityIndex), :]
        y = (logAmenityIndex[~np.isnan(logAmenityIndex)]).real
        parametersAmenities, residuals, rank, s = np.linalg.lstsq(
            A, y, rcond=None)
        modelSpecification = sm.GLM(
            residu, tableRegression.loc[:, variables_regression])
        modelAmenities = modelSpecification.fit()
        errorAmenities = modelAmenities.resid_pearson

    # The residuals follow a lognormal law, hence the associated
    # log-likelihood from the ComputeLogLikelihood function (see math
    # appendix)
    scoreAmenities = ComputeLogLikelihood(
        np.sqrt(np.nansum(errorAmenities ** 2)
                / np.nansum(~np.isnan(errorAmenities))),
        errorAmenities)

    # %% Error on income sorting

    # We start by defining the input variables
    utility_over_amenity = Uo[:, None] / np.exp(logAmenityIndex[None, :])
    net_income_select = net_income[:, selectedRents]

    # We then apply the formula from the technical documentation
    bidRents = (
        (net_income_select * (1 - beta) ** (1 - beta) * beta ** beta)
        / (utility_over_amenity
           - basicQ * (1 - beta) ** (1 - beta) * beta ** beta)
    )

    # We select SPs where at least one income group makes a positive bid
    selectedBidRents = (np.nansum(bidRents, 0) > 0)
    # We again select the dominant income group
    incomeGroupSelectedRents = groupLivingSpMatrix[:, selectedRents]

    # We create a function for the minus log-likelihood of income sorting:
    # see the technical documentation for the math formula. We consider
    # the minus form to be able to use the scipy.optimize.minimize()
    # function, and would then return the minus form of the minimum
    # obtained to recover the maximum value of the log-likelihood (across
    # scale factors).
    minusLogLikIncomeSorting = (
        lambda scaleParam:
        - np.nansum(np.nansum(
            bidRents[:, selectedBidRents] / scaleParam
            * incomeGroupSelectedRents[:, selectedBidRents], 0))
        + np.nansum(np.log(np.nansum(
            np.exp(bidRents[:, selectedBidRents] / scaleParam), 0)))
    )

    # In practice, since we cannot identify the gravity parameter, we pin
    # it at 10, which yields a score of the same order of magnitude as the
    # fit on amenities. It is also a realistic and conservative guess: the
    # order of magnitude is the same as for the gravity parameter from the
    # commuting choice model, and it is low enough (given the function
    # definition, the higher the parameter, the smaller the output).
    # Also note that the improvement is marginal for higher orders of
    # magnitude. We keep the optimization in comments for reference, in
    # case we want to be less conservative about the value of the
    # log-likelihood.
    scoreIncomeSorting = - minusLogLikIncomeSorting(10)
    # bnds = {(0, 10**10)}
    # initScale = 10**5
    # res = scipy.optimize.minimize(
    #     minusLogLikIncomeSorting, initScale, bounds=bnds,
    #     options={'maxiter': 100, 'disp': False})
    # optiminusLogLikIncomeSorting = res.fun
    # exitFlag = res.success
    # print(exitFlag)
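    # A quick flatness check (a sketch, not part of the original
    # calibration): scanning a few orders of magnitude for the scale
    # parameter illustrates the marginal improvement noted above.
    # for scaleParam in (1, 10, 100, 1000):
    #     print(scaleParam, - minusLogLikIncomeSorting(scaleParam))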
    # %% Error on dwelling sizes

    # We get the theoretical values for market rents, which we identify
    # with the highest bid-rents from the dominant income group.
    # NB: as bid-rents are already defined for the sample of SPs selected
    # under selectedRents, we need to take the subsection
    # selectedDwellingSize[selectedRents] to operate another selection
    # based on selectedDwellingSize while respecting array dimensions
    simulatedRents = np.nansum(
        bidRents[:, selectedDwellingSize[selectedRents]]
        * groupLivingSpMatrix[:, selectedDwellingSize],
        0)

    # We get the theoretical values for dwelling sizes from the
    # pre-defined function (see the technical documentation for the math
    # formula), leveraging the above definition of simulatedRents
    dwellingSize = CalculateDwellingSize(
        beta,
        basicQ,
        np.nansum(net_income[:, selectedDwellingSize]
                  * groupLivingSpMatrix[:, selectedDwellingSize], 0),
        simulatedRents)

    # The (log) error on observed dwelling sizes is directly obtained by
    # taking the log-difference with the theoretical counterpart (staying
    # under the same sample selection)
    errorDwellingSize = (
        np.log(dwellingSize)
        - np.log(dataDwellingSize[selectedDwellingSize])
    )

    # The residuals follow a lognormal law, hence the associated
    # log-likelihood from the ComputeLogLikelihood function (see math
    # appendix)
    scoreDwellingSize = ComputeLogLikelihood(
        np.sqrt(np.nansum(errorDwellingSize ** 2)
                / np.nansum(~np.isnan(errorDwellingSize))),
        errorDwellingSize)

    # %% Total

    # The sum of logs is the same as the log of a product, hence we can
    # define our composite log-likelihood function as the sum of our
    # separate log-likelihoods
    scoreTotal = scoreAmenities + scoreDwellingSize + scoreIncomeSorting
    # scoreTotal = scoreAmenities

    # We may also include a measure of the fit for housing supply /
    # household density, which has not been retained in this version of
    # the model, as the underlying relation is already used to calibrate
    # the parameters of the construction function.
    # NB: we still define the variables (set equal to zero) to serve as
    # placeholders
    scoreHousing = 0
    parametersHousing = 0

    return (scoreTotal, scoreAmenities, scoreDwellingSize,
            scoreIncomeSorting, scoreHousing, parametersAmenities,
            modelAmenities, parametersHousing)
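
# A minimal usage sketch (hypothetical grid values, kept in comments so the
# module stays import-only; the actual scan over parameter combinations
# lives in the calling calibration script):
# scores = [
#     LogLikelihoodModel(
#         np.array([beta, q0, u3, u4]), Uo2, net_income,
#         groupLivingSpMatrix, dataDwellingSize, selectedDwellingSize,
#         dataRent, selectedRents, predictorsAmenitiesMatrix,
#         tableRegression, variables_regression, CalculateDwellingSize,
#         ComputeLogLikelihood, 0, options)[0]
#     for beta in (0.2, 0.25, 0.3)
#     for q0 in (2, 4)
#     for u3 in (10 ** 4, 2 * 10 ** 4)
#     for u4 in (10 ** 5, 2 * 10 ** 5)
# ]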