NaN (Not a Number)

Instance of:

AKA: Not a Number

Distinct from:

English: NaN is a special value of a floating-point numeric data type that represents a result that is undefined or unrepresentable. Dividing 0 by 0, for instance, is not defined and produces a NaN value in most floating-point systems.

Formalization:

\[ \]

Cites: NaN ; Wikidata Q858684

R null values: NULL, NA, NaN, Inf

Working with NULL, NA, and NaN

Code

Import libraries and spin up toy data objects and databases.
toy_vector_numeric <- c(1,2,3,4,5)
toy_vector_character <- c('a','b','c','d','e')
toy_matrix <- matrix(1:9, nrow=3,ncol=3)
toy_list <- list('a','1',T,c('red','green'))
toy_df <- data.frame(id=c('unit1','unit2','unit3'), y=c(1,2,3), x= c(3,2,1))

toy_dirty_df <- data.frame(id=c('','NA','NaN','inf'), y=c(1,2,3,4), x= c(0,NA,NaN,Inf)) #can't explicitly include NULL 

library(data.table)
toy_dt <- data.table(id=c('unit1','unit2','unit3'), y=c(1,2,3), x= c(3,2,1))
toy_dirty_dt <- as.data.table(toy_dirty_df)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()   masks data.table::between()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::first()     masks data.table::first()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::last()      masks data.table::last()
✖ purrr::transpose() masks data.table::transpose()
library(arrow)

Attaching package: 'arrow'

The following object is masked from 'package:utils':

    timestamp
import numpy as np
toy_vector_numeric = np.array([1,2,3,4,5])
toy_vector_character = np.array(['a','b','c','d','e'])
toy_list = ['a','1',True,['red','green']]
toy_dictionary = { 'a':1 , 'b':2, 'c':3}

from jax import numpy as jnp
toy_vector_numeric_jax = jnp.array([1,2,3,4,5])
#toy_vector_character_jax = jnp.array(['a','b','c','d','e']) #only numeric is allowed in jax
WARNING:jax._src.lib.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
import pandas as pd
toy_df = pd.DataFrame(data={'id': ['unit1','unit2','unit3'], 'y': [1, 2, 3], 'x': [3, 2, 1]})

import torch

import tensorflow as tf

import pyarrow as pa
library(DBI)
# Create an ephemeral in-memory RSQLite database
#con <- dbConnect(RSQLite::SQLite(), dbname = ":memory:")
#dbListTables(con)
#dbWriteTable(con, "mtcars", mtcars)
#dbListTables(con)

#Configuration failed because libpq was not found. Try installing:
#* deb: libpq-dev libssl-dev (Debian, Ubuntu, etc)
#install.packages('RPostgres')
#remotes::install_github("r-dbi/RPostgres")
#Took forever because my file permissions were broken
#pg_lsclusters
require(RPostgres)
# Connect to the default postgres database
#I had to follow these instructions and create both a Postgres username and a database whose names match my Ubuntu username
#https://www.digitalocean.com/community/tutorials/how-to-install-postgresql-on-ubuntu-20-04-quickstart
con_Postgres <- dbConnect(RPostgres::Postgres())

DROP TABLE IF EXISTS toy_df;

CREATE TABLE IF NOT EXISTS toy_df (
  id varchar(5),
    y INTEGER,
    x INTEGER
);

INSERT INTO toy_df (id, y, x)
VALUES
    ('unit1',1,3),
    ('unit2',2,2),
    ('unit3',3,1);
    
#install.packages("duckdb")
library("DBI")
con_duckdb = dbConnect(duckdb::duckdb(), ":memory:")
#pip install duckdb==0.6.0
import duckdb
con_duckdb = duckdb.connect()

0.1 R

Base

Finite, Infinite and NaN Numbers

“Do not test equality to NaN, or even use identical, since systems typically have many different NaN values. One of these is used for the numeric missing value NA, and is.nan is false for that value. A complex number is regarded as NaN if either the real or imaginary part is NaN but not NA. All elements of logical, integer and raw vectors are considered not to be NaN.”

NaN
[1] NaN
NaN==NaN
[1] NA
identical(NaN,NaN)
[1] TRUE
identical(NaN,0 / 0)
[1] TRUE
pi / 0 ## = Inf; a positive number divided by zero gives Inf (a negative number gives -Inf)
[1] Inf
0 / 0  ## =  NaN
[1] NaN
1/0 + 1/0 # Inf
[1] Inf
1/0 - 1/0 # NaN
[1] NaN
sin(NaN)
[1] NaN
is.finite(NaN)
[1] FALSE
is.infinite(NaN)
[1] FALSE
is.nan(NaN)
[1] TRUE
is.null(NaN)
[1] FALSE
is.na(NaN)
[1] TRUE

Tidyverse

Is object an empty vector or NULL?

is_empty(NaN)
[1] FALSE
is_empty(list(NaN))
[1] FALSE
toy_dirty_df
   id y   x
1     1   0
2  NA 2  NA
3 NaN 3 NaN
4 inf 4 Inf
toy_dirty_df %>% replace_na(list(x = 999 ))
   id y   x
1     1   0
2  NA 2 999
3 NaN 3 999
4 inf 4 Inf
toy_dirty_df %>% fill(x)
   id y   x
1     1   0
2  NA 2   0
3 NaN 3   0
4 inf 4 Inf
toy_dirty_df %>% filter(x==0)
  id y x
1    1 0
toy_dirty_df %>% filter(x!=0)
   id y   x
1 inf 4 Inf
toy_dirty_df %>% filter(!x %in% 0)
   id y   x
1  NA 2  NA
2 NaN 3 NaN
3 inf 4 Inf
toy_dirty_df %>% filter(!is.finite(x))
   id y   x
1  NA 2  NA
2 NaN 3 NaN
3 inf 4 Inf
toy_dirty_df %>% mutate(z=coalesce(x,y ))
   id y   x   z
1     1   0   0
2  NA 2  NA   2
3 NaN 3 NaN   3
4 inf 4 Inf Inf

DataTable

toy_clean_dt=toy_dirty_dt[,x:=nafill(x, type="const", fill=999, nan=NaN),]
print(toy_clean_dt)
    id y   x
1:     1   0
2:  NA 2 999
3: NaN 3 NaN
4: inf 4 Inf
toy_clean_dt=toy_dirty_dt[,x:=nafill(x, type="const", fill=999, nan=NA),]
print(toy_clean_dt)
    id y   x
1:     1   0
2:  NA 2 999
3: NaN 3 999
4: inf 4 Inf

Arrow

Apache Arrow data types

0.2 Python

0.2.0.1 3.x / math/ statistics


float("nan")
nan
float("Nan")
nan
float("NaN")
nan
float("NAN")
nan
import math

math.nan
nan
print(math.isnan(math.nan))
True
print(math.nan==math.nan)
False

NaNs don’t have a length in Python: NaN is an ordinary float, so calling len() on it raises an error.


try:
  print(len(math.nan)) #NaN is a float, and floats have no length
except Exception as ex:
  print(ex)
object of type 'float' has no len()

0.2.0.2 NumPy / SciPy / scikit-learn


np.array([[ None,  None],
        [ None,  None]])
array([[None, None],
       [None, None]], dtype=object)
np.array([[ None,  None],
        [ None,  None]], dtype=bool)
array([[False, False],
       [False, False]])

0.2.0.3 Pandas


pd.Series([None, None])
0    None
1    None
dtype: object
pd.Series([None, None]).isnull()
0    True
1    True
dtype: bool
pd.Series([1, None]).notnull()
0     True
1    False
dtype: bool

Arrow

Type Metadata

“None values and NAN handling

As mentioned in the above section, the Python object None is always converted to an Arrow null element on the conversion to pyarrow.Array. For the float NaN value which is either represented by the Python object float(‘nan’) or numpy.nan we normally convert it to a valid float value during the conversion. If an integer input is supplied to pyarrow.array that contains np.nan, ValueError is raised.

To handle better compatibility with Pandas, we support interpreting NaN values as null elements. This is enabled automatically on all from_pandas function and can be enable on the other conversion functions by passing from_pandas=True as a function parameter.”

0.3 Jax


jnp.array([[ None,  None],
         [ None,  None]])
DeviceArray([[nan, nan],
             [nan, nan]], dtype=float32)

0.4 Numpyro

Uses jax’s boolean array

0.5 Stan

0.6 Torch

TORCH.NAN_TO_NUM


torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14])
tensor([   nan,    inf,   -inf, 3.1400])
try:
  print(torch.tensor([None])) #Fails
except Exception as ex:
  print(ex)
  
Could not infer dtype of NoneType

0.7 Tensorflow


tf.constant([5.0, np.nan, 6.8, np.nan, np.inf])
<tf.Tensor: shape=(5,), dtype=float32, numpy=array([5. , nan, 6.8, nan, inf], dtype=float32)>
tf.math.is_nan(tf.constant([5.0, np.nan, 6.8, np.nan, np.inf]))
<tf.Tensor: shape=(5,), dtype=bool, numpy=array([False,  True, False,  True, False])>
try:
  print(tf.constant([None,None], dtype=tf.bool))
except Exception as ex:
  print(ex)
Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.

0.8 PostgreSQL


DROP TABLE IF EXISTS null_df;

CREATE TABLE IF NOT EXISTS null_df (
example_null boolean,
example_notnull boolean NOT NULL
);

Error: Failed to fetch row: ERROR: null value in column “example_notnull” of relation “null_df” violates not-null constraint DETAIL: Failing row contains (t, null).


INSERT INTO null_df (example_null, example_notnull)
VALUES
(NULL, False),
(TRUE, NULL);
;

INSERT INTO null_df (example_null, example_notnull)
VALUES
(NULL, False),
(TRUE, TRUE);
;

SELECT * from null_df;
0 records
example_null example_notnull

0.9 DuckDB

Boolean Type


DROP TABLE IF EXISTS null_df;

CREATE TABLE IF NOT EXISTS null_df (
example_null boolean,
example_notnull boolean NOT NULL
);

Error: rapi_execute: Failed to run query Error: Constraint Error: NOT NULL constraint failed: null_df.example_notnull Failed to execute SQL chunk


INSERT INTO null_df (example_null, example_notnull)
VALUES
(NULL, False),
(TRUE, NULL);
;

INSERT INTO null_df (example_null, example_notnull)
VALUES
(NULL, False),
(TRUE, TRUE);
;

SELECT * from null_df;
0 records
example_null example_notnull