Calibration

Instance of:

AKA: Bool

Distinct from:

English: Calibration is a post-processing operation applied to predicted probabilities so that they match the true probabilities of the target distribution. For example, a classifier may output predictions that correctly rank-order the options or reflect the classifier's relative confidence between two options, \(P(Dog|X)=0.9\), \(P(Cat|X)=0.2\); such scores are useful for assigning a final thresholded label but not for estimating the true distribution of dogs and cats in the data.

Formalization:

\[ P\left(Y = 1 \mid \hat{p}(X) = p\right) = p \quad \text{for all } p \in [0, 1] \]

Cites: Wikipedia; Wikidata

Code

Import libraries and spin up toy data objects and databases.
# Toy base-R objects reused by the examples in this document.
toy_vector_numeric <- c(1, 2, 3, 4, 5)
toy_vector_character <- c("a", "b", "c", "d", "e")
toy_matrix <- matrix(1:9, nrow = 3, ncol = 3)
# Heterogeneous list: two strings, a logical, and a character vector.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
toy_list <- list("a", "1", TRUE, c("red", "green"))
toy_df <- data.frame(
  id = c("unit1", "unit2", "unit3"),
  y = c(1, 2, 3),
  x = c(3, 2, 1)
)

# "Dirty" variant: `id` holds the literal strings "", "NA", "NaN" and "inf"
# (quoted text, not missing values), while `x` holds a real NA, NaN and Inf.
toy_dirty_df <- data.frame(
  id = c("", "NA", "NaN", "inf"),
  y = c(1, 2, 3, 4),
  x = c(0, NA, NaN, Inf)
) # can't explicitly include NULL

library(data.table)
# data.table counterparts of the toy data frames.
toy_dt <- data.table(
  id = c("unit1", "unit2", "unit3"),
  y = c(1, 2, 3),
  x = c(3, 2, 1)
)
# Built by converting the base data frame `toy_dirty_df`.
toy_dirty_dt <- as.data.table(toy_dirty_df)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()   masks data.table::between()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::first()     masks data.table::first()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::last()      masks data.table::last()
✖ purrr::transpose() masks data.table::transpose()
library(arrow)

Attaching package: 'arrow'

The following object is masked from 'package:utils':

    timestamp
import numpy as np
# Toy NumPy arrays and built-in containers reused by the Python examples.
toy_vector_numeric = np.arange(1, 6)  # the integers 1 through 5
toy_vector_character = np.array(list("abcde"))  # one single-character string per element
# Heterogeneous list: two strings, a bool, and a nested list of strings.
toy_list = ["a", "1", True, ["red", "green"]]
toy_dictionary = dict(a=1, b=2, c=3)

from jax import numpy as jnp
# JAX mirror of the numeric toy vector (the integers 1 through 5).
toy_vector_numeric_jax = jnp.array(list(range(1, 6)))
# No character counterpart is created: per the note below, only numeric data is allowed in jax.
#toy_vector_character_jax = jnp.array(['a','b','c','d','e']) #only numeric is allowed in jax
WARNING:jax._src.lib.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
import pandas as pd
# Toy pandas frame: three rows with a string id column and two integer columns.
toy_df = pd.DataFrame(
    {
        "id": ["unit1", "unit2", "unit3"],
        "y": [1, 2, 3],
        "x": [3, 2, 1],
    }
)

import torch

import tensorflow as tf

import pyarrow as pa
library(DBI)
# Create an ephemeral in-memory RSQLite database
#con <- dbConnect(RSQLite::SQLite(), dbname = ":memory:")
#dbListTables(con)
#dbWriteTable(con, "mtcars", mtcars)
#dbListTables(con)

#Configuration failed because libpq was not found. Try installing:
#* deb: libpq-dev libssl-dev (Debian, Ubuntu, etc)
#install.packages('RPostgres')
#remotes::install_github("r-dbi/RPostgres")
#Took forever because my file permissions were broken
#pg_lsclusters
# Load RPostgres with library() so a missing package stops the script here;
# require() would only warn and return FALSE, letting execution continue.
library(RPostgres)
# Connect to the default postgres database
# I had to follow these instructions and create both a username and database that matched my ubuntu name
# https://www.digitalocean.com/community/tutorials/how-to-install-postgresql-on-ubuntu-20-04-quickstart
# No connection arguments are passed, so the driver's defaults apply.
# NOTE(review): no matching dbDisconnect(con_Postgres) is visible in this
# section -- confirm the connection is closed once the examples finish.
con_Postgres <- dbConnect(RPostgres::Postgres())

-- Remove any previous copy of the toy table so the chunk can be re-run.
DROP TABLE IF EXISTS toy_df;

-- One short text key column and two integer value columns.
CREATE TABLE IF NOT EXISTS toy_df (
    id VARCHAR(5),
    y  INTEGER,
    x  INTEGER
);

-- Seed the three example rows.
INSERT INTO toy_df (id, y, x)
VALUES ('unit1', 1, 3),
       ('unit2', 2, 2),
       ('unit3', 3, 1);
    
#install.packages("duckdb")
library("DBI")
# Open a DuckDB connection backed by an in-memory database.
# Uses `<-` for assignment, matching the other R chunks in this document.
con_duckdb <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")

#pip install duckdb==0.6.0
import duckdb
# Open a DuckDB connection; ":memory:" is the default database, spelled out here.
con_duckdb = duckdb.connect(database=":memory:")

0.1 R

Base

Tidyverse

DataTable

Arrow

0.2 Python

0.2.0.1 3.x / math / statistics

0.2.0.2 NumPy / SciPy / scikit-learn

0.2.0.3 Pandas

0.3 Jax

0.4 Numpyro

0.5 Stan

0.6 Torch

0.7 Tensorflow

0.8 PostgreSQL

0.9 DuckDB