Quickstart

Installing the Package

Install the package using pip:

$ pip install pycausalgps

See the Setting Up Environment for more details.

Generating Synthetic Data

The package provides a function to generate synthetic data.

>>> from pycausalgps.base.utils import generate_syn_pop
>>> data = generate_syn_pop(sample_size=1000,
                            seed_val=456,
                            outcome_sd=0.25,
                            gps_spec=1,
                            cova_spec=2)

Estimating GPS values

Two approaches are provided to estimate GPS values. These approaches are different based on the type of gps_denisty specified. These approaches are normal and kernel.

Example of normal approach:

from pycausalgps.base.utils import generate_syn_pop
from pycausalgps.base import GeneralizedPropensityScore

params = {"gps_density": "normal",
      "exposure_column": "treat",
      "covariate_column_num": ["cf1",
                               "cf2",
                               "cf3",
                               "cf4",
                               "cf6"],
      "covariate_column_cat": ["cf5"],
      "libs":{
              "xgboost":{
                         "n_estimators": 100,
                         "max_depth": 3,
                         "learning_rate": 0.1,
                         "test_rate": 0.2,
                         "random_state": 42
                         }
                     }
}
data = generate_syn_pop(sample_size=1000,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps = GeneralizedPropensityScore(data, params)

Example of kernel approach:

from pycausalgps.base.utils import generate_syn_pop
from pycausalgps.base import GeneralizedPropensityScore

params = {"gps_density": "kernel",
      "exposure_column": "treat",
      "covariate_column_num": ["cf1",
                               "cf2",
                               "cf3",
                               "cf4",
                               "cf6"],
      "covariate_column_cat": ["cf5"],
      "libs":{
              "xgboost":{
                         "n_estimators": 100,
                         "max_depth": 3,
                         "learning_rate": 0.1,
                         "test_rate": 0.2,
                         "random_state": 42
                         }
                     }
}
data = generate_syn_pop(sample_size=1000,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps = GeneralizedPropensityScore(data, params)

Generating Pseudo-population

There are two implemented methdos to generate pseudo-population. These methods are: weighting and matching.

Example of weighting approach:

data = generate_syn_pop(sample_size=1000,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps_params = {"gps_density": "normal",
              "exposure_column": "treat",
              "covariate_column_num": ["cf1",
                                       "cf2",
                                       "cf3",
                                       "cf4",
                                       "cf6"],
              "covariate_column_cat": ["cf5"],
              "libs":{
                      "xgboost":{
                             "n_estimators": 100,
                             "max_depth": 3,
                             "learning_rate": 0.1,
                             "test_rate": 0.2,
                             "random_state": 42,
                             "n_jobs": 12
                             }
                         }
}

gps = GeneralizedPropensityScore(data, gps_params)

results = gps.get_results()
gps_data = {
    'data' : results.get("data"),
    'gps_density' : results.get("gps_density")
    'gps_minmax': results.get("gps_minmax")
}

pspop_params = {"approach" : "weighting",
                "exposure_column": "treat",
                "covariate_column_num": ["cf1",
                                         "cf2",
                                         "cf3",
                                         "cf4",
                                         "cf6"],
                "covariate_column_cat": ["cf5"],}

pspop = PseudoPopulation(data=data,
                         gps_data=gps_data,
                         params=pspop_params)

Example of matching approach:

from pycausalgps.base.utils import generate_syn_pop
from pycausalgps.gps import GeneralizedPropensityScore
from pycausalgps.pseudo_population import PseudoPopulation


# matching
gps_params = {"gps_density": "normal",
              "exposure_column": "treat",
              "covariate_column_num": ["cf1",
                                       "cf2",
                                       "cf3",
                                       "cf4",
                                       "cf6"],
              "covariate_column_cat": ["cf5"],
              "libs":{
                      "xgboost":{
                             "n_estimators": 100,
                             "max_depth": 3,
                             "learning_rate": 0.1,
                             "test_rate": 0.2,
                             "random_state": 42,
                             "n_jobs": 12
                             }
                         }
}

data = generate_syn_pop(sample_size=500,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps = GeneralizedPropensityScore(data, gps_params)

results = gps.get_results()
gps_data = {
    'data' : results.get("data"),
    'gps_density' : results.get("gps_density"),
    'gps_minmax': results.get("gps_minmax")
}



pspop_params = {"approach" : "matching",
                "exposure_column": "treat",
                "covariate_column_num": ["cf1",
                                         "cf2",
                                         "cf3",
                                         "cf4",
                                         "cf6"],
                "covariate_column_cat": ["cf5"],
                "control_params": {"caliper": 1.0,
                                   "scale": 0.5,
                                   "dist_measure": "l1",
                                   "bin_seq": None},
                "run_params": {"n_thread": 12,
                               "chunk_size": 500},
}

pspop = PseudoPopulation(data=data,
                         gps_data=gps_data,
                         params=pspop_params)

Note

Note that the matching approach is computationally expensive and internally uses multiprocessing. If you encounter any runtime error, try put the code inside a if __name__ == ‘__main__’: block.

Exposure Response Function

The package provides a function to estimate exposure response function. There are three implemented methods to estimate exposure response function. These methods are: parametric, semiparametric and nonparametric.

Example of parametric approach:

The parametric approach estimates the hazard ratios using a parametric regression model. In the backend we us gnm R package to implement generalized additive models.

import pandas as pd

from pycausalgps.base.utils import generate_syn_pop
from pycausalgps.gps import GeneralizedPropensityScore
from pycausalgps.pseudo_population import PseudoPopulation
from pycausalgps.rscripts.rfunctions import estimate_pmetric_erf

gps_params = {"gps_density": "normal",
          "exposure_column": "treat",
          "covariate_column_num": ["cf1",
                                   "cf2",
                                   "cf3",
                                   "cf4",
                                   "cf6"],
          "covariate_column_cat": ["cf5"],
          "libs":{
                  "xgboost":{
                         "n_estimators": 100,
                         "max_depth": 3,
                         "learning_rate": 0.1,
                         "test_rate": 0.2,
                         "random_state": 42
                         }
                     }
}

data = generate_syn_pop(sample_size=200,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps = GeneralizedPropensityScore(data, gps_params)

results = gps.get_results()
gps_data = {
    'data' : results.get("data"),
    'gps_density' : results.get("gps_density")
}

pspop_params = {"approach" : "weighting",
            "exposure_column": "treat",
            "covariate_column_num": ["cf1",
                                     "cf2",
                                     "cf3",
                                     "cf4",
                                     "cf6"],
            "covariate_column_cat": ["cf5"],}

ps_pop = PseudoPopulation(data=data,
                          gps_data=gps_data,
                          params=pspop_params)

ps_results = ps_pop.get_results()
erf_pmetric = estimate_pmetric_erf(formula="Y ~ treat",
                                   family="gaussian",
                                   data = ps_results.get("data"))

Example of semiparametric approach:

The semiparametric approach estimates the smoothed exposure-response function using generalized additive model with splines. In the backend we us gam R package to implement generalized additive models.

# Code from pmetric approach
from pycausalgps.rscripts.rfunctions import estimate_semipmetric_erf

erf_pmetric = estimate_semipmetric_erf(formula="Y ~ treat",
                                       family="gaussian",
                                       data = ps_results.get("data"))

Example of nonparametric approach:

The nonparametric approach estimates the smoothed exposure-response function using kernel smoothing approach. In the backend we us locpol R package to implemetn locpol polynomial fitting with a kernel weight. A data-driven bandwidth selection is implemented.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycausalgps.base.utils import generate_syn_pop
from pycausalgps.gps import GeneralizedPropensityScore
from pycausalgps.pseudo_population import PseudoPopulation
from pycausalgps.erf_helper import estimate_npmetric_erf

gps_params = {"gps_density": "normal",
      "exposure_column": "treat",
      "covariate_column_num": ["cf1",
                               "cf2",
                               "cf3",
                               "cf4",
                               "cf6"],
      "covariate_column_cat": ["cf5"],
      "libs":{
              "xgboost":{
                     "n_estimators": 100,
                     "max_depth": 3,
                     "learning_rate": 0.1,
                     "test_rate": 0.2,
                     "random_state": 42
                     }
                 }
}

data = generate_syn_pop(sample_size=1000,
                        seed_val=456,
                        outcome_sd=0.25,
                        gps_spec=1,
                        cova_spec=2)

gps = GeneralizedPropensityScore(data, gps_params)

results = gps.get_results()

gps_data = {
'data' : results.get("data"),
'gps_density' : results.get("gps_density")
}

pspop_params = {"approach" : "weighting",
            "exposure_column": "treat",
            "covariate_column_num": ["cf1",
                                    "cf2",
                                    "cf3",
                                    "cf4",
                                    "cf6"],
            "covariate_column_cat": ["cf5"],}

ps_pop = PseudoPopulation(data=data, gps_data=gps_data, params=pspop_params)
ps_results = ps_pop.get_results()

m_Y = ps_results.get("data")["Y"].to_numpy()
m_w = ps_results.get("data")["treat"].to_numpy()
counter_weight = ps_results.get("data")["counter_weight"].to_numpy()
bw_seq = np.arange(0.1, 4.0, 0.01)
w_vals = np.arange(1, 20, 0.2)
nthread = 12

erf_pmetric = estimate_npmetric_erf(m_Y, m_w, counter_weight,
                                bw_seq, w_vals, nthread)