Filter

Instance of: Higher-order function

AKA: Subset

Distinct from:

English:

Formalization:

\[ \]

Cites: Wikipedia ; Wikidata

Code

Import libraries and spin up toy data objects and databases.
# Toy fixtures reused by the R examples further down.
toy_vector_numeric <- c(1, 2, 3, 4, 5)
toy_vector_character <- c("a", "b", "c", "d", "e")
toy_matrix <- matrix(1:9, nrow = 3, ncol = 3)
# Heterogeneous list: two strings, a logical, and a character vector.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
toy_list <- list("a", "1", TRUE, c("red", "green"))
toy_df <- data.frame(
  id = c("unit1", "unit2", "unit3"),
  y = c(1, 2, 3),
  x = c(3, 2, 1)
)

# "Dirty" frame: id holds look-alike strings, x holds the real special values.
toy_dirty_df <- data.frame(
  id = c("", "NA", "NaN", "inf"),
  y = c(1, 2, 3, 4),
  x = c(0, NA, NaN, Inf)
) # can't explicitly include NULL

library(data.table)
toy_dt <- data.table(id=c('unit1','unit2','unit3'), y=c(1,2,3), x= c(3,2,1))
toy_dirty_dt <- as.data.table(toy_dirty_df)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()   masks data.table::between()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::first()     masks data.table::first()
✖ dplyr::lag()       masks stats::lag()
✖ dplyr::last()      masks data.table::last()
✖ purrr::transpose() masks data.table::transpose()
library(arrow)

Attaching package: 'arrow'

The following object is masked from 'package:utils':

    timestamp
import numpy as np
toy_vector_numeric = np.array([1,2,3,4,5])
toy_vector_character = np.array(['a','b','c','d','e'])
toy_list = ['a','1',True,['red','green']]
toy_dictionary = { 'a':1 , 'b':2, 'c':3}

from jax import numpy as jnp
toy_vector_numeric_jax = jnp.array([1,2,3,4,5])
#toy_vector_character_jax = jnp.array(['a','b','c','d','e']) #only numeric is allowed in jax
WARNING:jax._src.lib.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
import pandas as pd
toy_df = pd.DataFrame(data={'id': ['unit1','unit2','unit3'], 'y': [1, 2, 3], 'x': [3, 2, 1]})

import torch

import tensorflow as tf

import pyarrow as pa
library(DBI)
# Create an ephemeral in-memory RSQLite database
#con <- dbConnect(RSQLite::SQLite(), dbname = ":memory:")
#dbListTables(con)
#dbWriteTable(con, "mtcars", mtcars)
#dbListTables(con)

#Configuration failed because libpq was not found. Try installing:
#* deb: libpq-dev libssl-dev (Debian, Ubuntu, etc)
#install.packages('RPostgres')
#remotes::install_github("r-dbi/RPostgres")
#Took forever because my file permissions were broken
#pg_lsclusters
# library() stops with an error when the package is missing; require() would
# only return FALSE and let the failure surface later at dbConnect().
library(RPostgres)
# Connect to the default postgres database.
# I had to follow these instructions and create both a username and a database
# that matched my Ubuntu user name:
# https://www.digitalocean.com/community/tutorials/how-to-install-postgresql-on-ubuntu-20-04-quickstart
con_Postgres <- dbConnect(RPostgres::Postgres())

-- Start from a clean slate so the example can be re-run.
DROP TABLE IF EXISTS toy_df;

-- Three columns: a short text id plus two integer columns.
CREATE TABLE IF NOT EXISTS toy_df (
    id VARCHAR(5),
    y  INTEGER,
    x  INTEGER
);

-- Load the three toy rows.
INSERT INTO toy_df (id, y, x) VALUES
    ('unit1', 1, 3),
    ('unit2', 2, 2),
    ('unit3', 3, 1);
    
# install.packages("duckdb")
library(DBI)
# Open a DuckDB connection using the special ":memory:" database name.
# Assigned with `<-` to match every other R assignment in this document.
con_duckdb <- dbConnect(duckdb::duckdb(), ":memory:")
#pip install duckdb==0.6.0
import duckdb
con_duckdb = duckdb.connect()

0.1 R

Base

subset: Subsetting Vectors, Matrices and Data Frames

toy_vector_numeric[toy_vector_numeric > 1]
[1] 2 3 4 5
toy_matrix[toy_matrix>5]
[1] 6 7 8 9
toy_df |> subset(x>1)
     id y x
1 unit1 1 3
2 unit2 2 2
toy_df[toy_df$x>1,]
     id y x
1 unit1 1 3
2 unit2 2 2

Dplyr

Subset rows using column values

toy_df %>% dplyr::filter(x>1)
     id y x
1 unit1 1 3
2 unit2 2 2

DataTable

Subsetting Rows

toy_dt |> subset(x>1)
      id y x
1: unit1 1 3
2: unit2 2 2
toy_dt[x>1,,]
      id y x
1: unit1 1 3
2: unit2 2 2

0.2 Python

How to Filter List Elements in Python filter(function, iterable, /)


# Explicit-loop filter: collect the elements of toy_list whose type is list.
filtered = []
for item in toy_list:
  # Skip anything that is not a list; only lists are kept.
  if type(item) != type([]):
    continue
  filtered.append(item)
filtered
[['red', 'green']]

Python Filter()


def fun(element):
  """Predicate for filter(): return True when `element` is a list.

  Uses isinstance(), the idiomatic type test, which also accepts
  subclasses of list.
  """
  return isinstance(element, list)
filtered = filter(fun, toy_list)
type(filtered)
<class 'filter'>
list(filtered)
[['red', 'green']]

#filtered = filter(lambda element: type(element)==type([]), toy_list) #this example originally failed only because `lambda` was misspelled as `lamda`; spelled correctly it works like the next example
#type(filtered)
#list(filtered)

scores = [70, 60, 80, 90, 50]
filtered = filter(lambda score: score >= 70, scores)
print(list(filtered))
[70, 80, 90]

Filtering Elements in List Comprehensions

[element for element in toy_list if type(element)==type([])]
[['red', 'green']]

filter items in a python dictionary where keys contain a specific string

{k:v for k,v in toy_dictionary.items() if v>1}
{'b': 2, 'c': 3}
# filter by key
dict(filter(lambda e : e[0]=='a', toy_dictionary.items() ) ) 
# filter by value
{'a': 1}
dict(filter(lambda e : e[1]>1, toy_dictionary.items() ) ) 
{'b': 2, 'c': 3}

Filtering (reducing) a NumPy Array

toy_vector_numeric[toy_vector_numeric>1]
array([2, 3, 4, 5])

Filter pandas Dataframes #This actually filters on row and column names, not values: pandas.DataFrame.filter

toy_df[toy_df['x']>1]
      id  y  x
0  unit1  1  3
1  unit2  2  2

pandas.DataFrame.query

toy_df.query('x > 1')
      id  y  x
0  unit1  1  3
1  unit2  2  2

0.3 PostgreSQL

PostgreSQL WHERE


-- Keep only the rows of toy_df where x is greater than 1.
SELECT *
FROM toy_df
WHERE x > 1;
2 records
id y x
unit1 1 3
unit2 2 2

0.4 Torch

import torch