Data Preparation with R

By Jose R. Zapata

This notebook covers data preparation topics in R, such as missing data, duplicated data, outliers, normalization, standardization, encoding of categorical variables, and more.

The code is developed in Google Colab.

Buy me a coffee

Given today's tooling for building data science and machine learning projects and products, Python is usually the recommended choice; however, R remains a very powerful tool for data analysis and visualization.

You can learn Python in the Ciencia de Datos con Python course.

Missing Data

# Load the data
url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/missing-data.csv"
download.file(url, destfile = "missing-data.csv", mode = "wb")

data <- read.csv("missing-data.csv", na.strings = "") # treat empty strings as NA
data # data frame with missing values
A data.frame: 27 × 3
Income  Phone_type  Car_type
<int>  <chr>  <chr>
89800  Android  Luxury
47500  Android  Non-Luxury
45000  iPhone  Luxury
44700  NA  Luxury
59500  iPhone  Luxury
NA  Android  Non-Luxury
63300  iPhone  Non-Luxury
52900  Android  Luxury
78200  Android  Luxury
145100  iPhone  Luxury
88600  iPhone  Non-Luxury
65600  iPhone  Luxury
NA  Android  Non-Luxury
94600  Android  Luxury
59400  iPhone  Luxury
47300  iPhone  Non-Luxury
72100  NA  Luxury
0  iPhone  Non-Luxury
0  Android  Luxury
83000  iPhone  Luxury
64100  Android  Non-Luxury
42100  iPhone  Non-Luxury
0  iPhone  Luxury
91500  iPhone  Non-Luxury
51200  Android  Luxury
13800  iPhone  Non-Luxury
47500  iPhone  Non-Luxury
# Drop the rows that contain NA values
data.cleaned <- na.omit(data)
data.cleaned
A data.frame: 23 × 3
    Income  Phone_type  Car_type
    <int>  <chr>  <chr>
1   89800  Android  Luxury
2   47500  Android  Non-Luxury
3   45000  iPhone  Luxury
5   59500  iPhone  Luxury
7   63300  iPhone  Non-Luxury
8   52900  Android  Luxury
9   78200  Android  Luxury
10  145100  iPhone  Luxury
11  88600  iPhone  Non-Luxury
12  65600  iPhone  Luxury
14  94600  Android  Luxury
15  59400  iPhone  Luxury
16  47300  iPhone  Non-Luxury
18  0  iPhone  Non-Luxury
19  0  Android  Luxury
20  83000  iPhone  Luxury
21  64100  Android  Non-Luxury
22  42100  iPhone  Non-Luxury
23  0  iPhone  Luxury
24  91500  iPhone  Non-Luxury
25  51200  Android  Luxury
26  13800  iPhone  Non-Luxury
27  47500  iPhone  Non-Luxury
# check whether a single element of the data frame is NA
is.na(data[4,2])

TRUE

# check whether a single element of the data frame is NA
is.na(data[4,1])

FALSE

# check which values of a column are NA
is.na(data$Income)
  1. FALSE
  2. FALSE
  3. FALSE
  4. FALSE
  5. FALSE
  6. TRUE
  7. FALSE
  8. FALSE
  9. FALSE
  10. FALSE
  11. FALSE
  12. FALSE
  13. TRUE
  14. FALSE
  15. FALSE
  16. FALSE
  17. FALSE
  18. FALSE
  19. FALSE
  20. FALSE
  21. FALSE
  22. FALSE
  23. FALSE
  24. FALSE
  25. FALSE
  26. FALSE
  27. FALSE
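Before deciding whether to drop or impute, it helps to quantify how much is actually missing. A minimal sketch, assuming the `data` frame loaded above (these calls are not part of the original notebook):

# count NA values per column and overall
colSums(is.na(data))      # NAs in each column
sum(is.na(data))          # total NAs in the data frame
mean(is.na(data$Income))  # proportion of missing values in Income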

Selective Cleaning

# Remove NA values only in the Income variable
data.income.cleaned <- data[!is.na(data$Income),]
data.income.cleaned
A data.frame: 25 × 3
    Income  Phone_type  Car_type
    <int>  <chr>  <chr>
1   89800  Android  Luxury
2   47500  Android  Non-Luxury
3   45000  iPhone  Luxury
4   44700  NA  Luxury
5   59500  iPhone  Luxury
7   63300  iPhone  Non-Luxury
8   52900  Android  Luxury
9   78200  Android  Luxury
10  145100  iPhone  Luxury
11  88600  iPhone  Non-Luxury
12  65600  iPhone  Luxury
14  94600  Android  Luxury
15  59400  iPhone  Luxury
16  47300  iPhone  Non-Luxury
17  72100  NA  Luxury
18  0  iPhone  Non-Luxury
19  0  Android  Luxury
20  83000  iPhone  Luxury
21  64100  Android  Non-Luxury
22  42100  iPhone  Non-Luxury
23  0  iPhone  Luxury
24  91500  iPhone  Non-Luxury
25  51200  Android  Luxury
26  13800  iPhone  Non-Luxury
27  47500  iPhone  Non-Luxury
# Identify which rows of the data frame are complete (no NA in any column)
complete.cases(data)
  1. TRUE
  2. TRUE
  3. TRUE
  4. FALSE
  5. TRUE
  6. FALSE
  7. TRUE
  8. TRUE
  9. TRUE
  10. TRUE
  11. TRUE
  12. TRUE
  13. FALSE
  14. TRUE
  15. TRUE
  16. TRUE
  17. FALSE
  18. TRUE
  19. TRUE
  20. TRUE
  21. TRUE
  22. TRUE
  23. TRUE
  24. TRUE
  25. TRUE
  26. TRUE
  27. TRUE
data.cleaned.2 <- data[complete.cases(data), ]
data.cleaned.2
A data.frame: 23 × 3
    Income  Phone_type  Car_type
    <int>  <chr>  <chr>
1   89800  Android  Luxury
2   47500  Android  Non-Luxury
3   45000  iPhone  Luxury
5   59500  iPhone  Luxury
7   63300  iPhone  Non-Luxury
8   52900  Android  Luxury
9   78200  Android  Luxury
10  145100  iPhone  Luxury
11  88600  iPhone  Non-Luxury
12  65600  iPhone  Luxury
14  94600  Android  Luxury
15  59400  iPhone  Luxury
16  47300  iPhone  Non-Luxury
18  0  iPhone  Non-Luxury
19  0  Android  Luxury
20  83000  iPhone  Luxury
21  64100  Android  Non-Luxury
22  42100  iPhone  Non-Luxury
23  0  iPhone  Luxury
24  91500  iPhone  Non-Luxury
25  51200  Android  Luxury
26  13800  iPhone  Non-Luxury
27  47500  iPhone  Non-Luxury
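complete.cases() can also be restricted to a subset of columns, which combines both ideas above: keep the rows that are complete only in the variables you care about. A hedged sketch, assuming the same `data` frame (the column selection is illustrative):

# keep rows with no NA in the chosen columns (Car_type is ignored here)
cols <- c("Income", "Phone_type")
data.partial <- data[complete.cases(data[, cols]), ]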
# Convert the zero incomes into NA
data$Income[data$Income == 0] <- NA
data
A data.frame: 27 × 3
Income  Phone_type  Car_type
<int>  <chr>  <chr>
89800  Android  Luxury
47500  Android  Non-Luxury
45000  iPhone  Luxury
44700  NA  Luxury
59500  iPhone  Luxury
NA  Android  Non-Luxury
63300  iPhone  Non-Luxury
52900  Android  Luxury
78200  Android  Luxury
145100  iPhone  Luxury
88600  iPhone  Non-Luxury
65600  iPhone  Luxury
NA  Android  Non-Luxury
94600  Android  Luxury
59400  iPhone  Luxury
47300  iPhone  Non-Luxury
72100  NA  Luxury
NA  iPhone  Non-Luxury
NA  Android  Luxury
83000  iPhone  Luxury
64100  Android  Non-Luxury
42100  iPhone  Non-Luxury
NA  iPhone  Luxury
91500  iPhone  Non-Luxury
51200  Android  Luxury
13800  iPhone  Non-Luxury
47500  iPhone  Non-Luxury
# Measures of center and spread on a column that contains NA values
mean(data$Income)

<NA>

mean(data$Income, na.rm = TRUE) # ignore the NA values

65763.6363636364

sd(data$Income)

<NA>

sd(data$Income, na.rm = TRUE) # ignore the NA values

26715.8691106552
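Most base R summaries behave the same way: either you pass na.rm = TRUE or the result is NA. A small sketch along the same lines (summary() is the exception and reports the NA count on its own):

# summary() reports the number of NA values directly
summary(data$Income)
# median() needs na.rm = TRUE, just like mean() and sd()
median(data$Income, na.rm = TRUE)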

Replacing Missing Values

# Load the data again, treating empty strings as NA
data <- read.csv("missing-data.csv", na.strings = "")

# Replace the missing values with the column mean
data$Income.mean <- ifelse(is.na(data$Income), # condition
                           mean(data$Income, na.rm = TRUE), # value if TRUE
                           data$Income # value if FALSE (else)
                           )
data
A data.frame: 27 × 4
Income  Phone_type  Car_type  Income.mean
<int>  <chr>  <chr>  <dbl>
89800  Android  Luxury  89800
47500  Android  Non-Luxury  47500
45000  iPhone  Luxury  45000
44700  NA  Luxury  44700
59500  iPhone  Luxury  59500
NA  Android  Non-Luxury  57872
63300  iPhone  Non-Luxury  63300
52900  Android  Luxury  52900
78200  Android  Luxury  78200
145100  iPhone  Luxury  145100
88600  iPhone  Non-Luxury  88600
65600  iPhone  Luxury  65600
NA  Android  Non-Luxury  57872
94600  Android  Luxury  94600
59400  iPhone  Luxury  59400
47300  iPhone  Non-Luxury  47300
72100  NA  Luxury  72100
0  iPhone  Non-Luxury  0
0  Android  Luxury  0
83000  iPhone  Luxury  83000
64100  Android  Non-Luxury  64100
42100  iPhone  Non-Luxury  42100
0  iPhone  Luxury  0
91500  iPhone  Non-Luxury  91500
51200  Android  Luxury  51200
13800  iPhone  Non-Luxury  13800
47500  iPhone  Non-Luxury  47500
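The same ifelse() pattern works with any other statistic. A sketch using the median instead, since it is less sensitive to extreme incomes (the Income.median column is hypothetical, not part of the original notebook):

# replace missing values with the column median instead of the mean
data$Income.median <- ifelse(is.na(data$Income),
                             median(data$Income, na.rm = TRUE),
                             data$Income)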
# Replace each missing value with a value sampled at random from the observed ones

# x is a data vector that may contain NA values
rand.impute <- function(x) {
  # missing is a TRUE/FALSE vector marking which positions of x are NA
  missing <- is.na(x)
  # n.missing is how many values of x are NA
  n.missing <- sum(missing)
  # x.obs holds the observed values of x (those that are not NA)
  x.obs <- x[!missing]
  # by default, return the same vector that was passed in
  imputed <- x
  # fill the missing positions with a simple random sample
  # (with replacement) of the observed values
  imputed[missing] <- sample(x.obs, n.missing, replace = TRUE)
  return(imputed)
}
random.impute.data.frame <- function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "imputed", sep = ".")
    dataframe[name] = rand.impute(dataframe[,col])
  }
  dataframe
}
data <- read.csv("missing-data.csv", na.strings = "")
data$Income[data$Income==0]<-NA
data <- random.impute.data.frame(data, c(1,2))
data
A data.frame: 27 × 5
Income  Phone_type  Car_type  Income.imputed  Phone_type.imputed
<int>  <chr>  <chr>  <int>  <chr>
89800  Android  Luxury  89800  Android
47500  Android  Non-Luxury  47500  Android
45000  iPhone  Luxury  45000  iPhone
44700  NA  Luxury  44700  iPhone
59500  iPhone  Luxury  59500  iPhone
NA  Android  Non-Luxury  51200  Android
63300  iPhone  Non-Luxury  63300  iPhone
52900  Android  Luxury  52900  Android
78200  Android  Luxury  78200  Android
145100  iPhone  Luxury  145100  iPhone
88600  iPhone  Non-Luxury  88600  iPhone
65600  iPhone  Luxury  65600  iPhone
NA  Android  Non-Luxury  59500  Android
94600  Android  Luxury  94600  Android
59400  iPhone  Luxury  59400  iPhone
47300  iPhone  Non-Luxury  47300  iPhone
72100  NA  Luxury  72100  Android
NA  iPhone  Non-Luxury  59500  iPhone
NA  Android  Luxury  59500  Android
83000  iPhone  Luxury  83000  iPhone
64100  Android  Non-Luxury  64100  Android
42100  iPhone  Non-Luxury  42100  iPhone
NA  iPhone  Luxury  59500  iPhone
91500  iPhone  Non-Luxury  91500  iPhone
51200  Android  Luxury  51200  Android
13800  iPhone  Non-Luxury  13800  iPhone
47500  iPhone  Non-Luxury  47500  iPhone
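Because sample() draws at random, the imputed values change every time the cell runs. If you need reproducible results, a seed can be fixed first; a minimal sketch reusing the functions defined above:

# fix the random seed so the imputation is reproducible (123 is an arbitrary choice)
set.seed(123)
data <- read.csv("missing-data.csv", na.strings = "")
data$Income[data$Income == 0] <- NA
data <- random.impute.data.frame(data, c(1, 2))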

Removing Duplicates

# create a data frame with duplicated rows
family.salary = c(40000, 60000, 50000, 80000, 60000, 70000, 60000)
family.size = c(4, 3, 2, 2, 3, 4, 3)
family.car = c("Lujo", "Compacto", "Utilitario", "Lujo",
               "Compacto", "Compacto", "Compacto")

family <- data.frame(family.salary, family.size, family.car)
family
A data.frame: 7 × 3
family.salary  family.size  family.car
<dbl>  <dbl>  <chr>
40000  4  Lujo
60000  3  Compacto
50000  2  Utilitario
80000  2  Lujo
60000  3  Compacto
70000  4  Compacto
60000  3  Compacto
# Return only the rows that are not repeated
family.unique <- unique(family)
family.unique
A data.frame: 5 × 3
   family.salary  family.size  family.car
   <dbl>  <dbl>  <chr>
1  40000  4  Lujo
2  60000  3  Compacto
3  50000  2  Utilitario
4  80000  2  Lujo
6  70000  4  Compacto
# detect which rows are duplicated
duplicated(family)
  1. FALSE
  2. FALSE
  3. FALSE
  4. FALSE
  5. TRUE
  6. FALSE
  7. TRUE
# Get the duplicated rows
family[duplicated(family),]
A data.frame: 2 × 3
   family.salary  family.size  family.car
   <dbl>  <dbl>  <chr>
5  60000  3  Compacto
7  60000  3  Compacto
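duplicated() can also be applied to a subset of columns, for cases where two rows should count as duplicates even if some other column differs. A hedged sketch on the same `family` data frame:

# treat rows with the same salary and size as duplicates, ignoring the car
dup.partial <- duplicated(family[, c("family.salary", "family.size")])
family[!dup.partial, ]  # keep the first occurrence of each combination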

Rescaling Data

Scale the data to a 0 to 1 or a 0 to 100 range.

# install it if you do not have it
#install.packages("scales")
library(scales)

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/data-conversion.csv"
download.file(url, destfile = "data-conversion.csv", mode = "wb")

students <- read.csv("data-conversion.csv")
students
A data.frame: 10 × 5
Age  State  Gender  Height  Income
<int>  <chr>  <chr>  <int>  <int>
23  NJ  F  61  5000
13  NY  M  55  1000
36  NJ  M  66  3000
31  VA  F  64  4000
58  NY  F  70  30000
29  TX  F  63  10000
39  NJ  M  67  50000
50  VA  M  70  55000
23  TX  F  61  2000
36  VA  M  66  20000
# the minimum value is mapped to 0
# the maximum value is mapped to 1
# it is a linear transformation
students$Income.rescaled <- rescale(students$Income)
students
A data.frame: 10 × 6
Age  State  Gender  Height  Income  Income.rescaled
<int>  <chr>  <chr>  <int>  <int>  <dbl>
23  NJ  F  61  5000  0.07407407
13  NY  M  55  1000  0.00000000
36  NJ  M  66  3000  0.03703704
31  VA  F  64  4000  0.05555556
58  NY  F  70  30000  0.53703704
29  TX  F  63  10000  0.16666667
39  NJ  M  67  50000  0.90740741
50  VA  M  70  55000  1.00000000
23  TX  F  61  2000  0.01851852
36  VA  M  66  20000  0.35185185
# Rescaling with the explicit formula
students$Income.rescaled2 <- (students$Income - min(students$Income)) /
                             (max(students$Income) - min(students$Income))
students
A data.frame: 10 × 7
Age  State  Gender  Height  Income  Income.rescaled  Income.rescaled2
<int>  <chr>  <chr>  <int>  <int>  <dbl>  <dbl>
23  NJ  F  61  5000  0.07407407  0.07407407
13  NY  M  55  1000  0.00000000  0.00000000
36  NJ  M  66  3000  0.03703704  0.03703704
31  VA  F  64  4000  0.05555556  0.05555556
58  NY  F  70  30000  0.53703704  0.53703704
29  TX  F  63  10000  0.16666667  0.16666667
39  NJ  M  67  50000  0.90740741  0.90740741
50  VA  M  70  55000  1.00000000  1.00000000
23  TX  F  61  2000  0.01851852  0.01851852
36  VA  M  66  20000  0.35185185  0.35185185
# rescale to a 0 to 100 range
students$Income.rescaled3 <- rescale(students$Income, to = c(0, 100))
students
A data.frame: 10 × 8
Age  State  Gender  Height  Income  Income.rescaled  Income.rescaled2  Income.rescaled3
<int>  <chr>  <chr>  <int>  <int>  <dbl>  <dbl>  <dbl>
23  NJ  F  61  5000  0.07407407  0.07407407  7.407407
13  NY  M  55  1000  0.00000000  0.00000000  0.000000
36  NJ  M  66  3000  0.03703704  0.03703704  3.703704
31  VA  F  64  4000  0.05555556  0.05555556  5.555556
58  NY  F  70  30000  0.53703704  0.53703704  53.703704
29  TX  F  63  10000  0.16666667  0.16666667  16.666667
39  NJ  M  67  50000  0.90740741  0.90740741  90.740741
50  VA  M  70  55000  1.00000000  1.00000000  100.000000
23  TX  F  61  2000  0.01851852  0.01851852  1.851852
36  VA  M  66  20000  0.35185185  0.35185185  35.185185
# Function to rescale several columns
rescale.many <- function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "rescaled", sep = ".")
    dataframe[name] <- rescale(dataframe[,col])
  }
  cat(paste("Hemos reescalado ", length(cols), " variable(s)"))
  dataframe
}
students <- rescale.many(students, c(1,4))
students
We have rescaled 2 variable(s)
A data.frame: 10 × 10
Age  State  Gender  Height  Income  Income.rescaled  Income.rescaled2  Income.rescaled3  Age.rescaled  Height.rescaled
<int>  <chr>  <chr>  <int>  <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
23  NJ  F  61  5000  0.07407407  0.07407407  7.407407  0.2222222  0.4000000
13  NY  M  55  1000  0.00000000  0.00000000  0.000000  0.0000000  0.0000000
36  NJ  M  66  3000  0.03703704  0.03703704  3.703704  0.5111111  0.7333333
31  VA  F  64  4000  0.05555556  0.05555556  5.555556  0.4000000  0.6000000
58  NY  F  70  30000  0.53703704  0.53703704  53.703704  1.0000000  1.0000000
29  TX  F  63  10000  0.16666667  0.16666667  16.666667  0.3555556  0.5333333
39  NJ  M  67  50000  0.90740741  0.90740741  90.740741  0.5777778  0.8000000
50  VA  M  70  55000  1.00000000  1.00000000  100.000000  0.8222222  1.0000000
23  TX  F  61  2000  0.01851852  0.01851852  1.851852  0.2222222  0.4000000
36  VA  M  66  20000  0.35185185  0.35185185  35.185185  0.5111111  0.7333333
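As an alternative to the explicit loop, the same result can be sketched with lapply(), applying rescale() to several columns at once (the column selection and the new ".rescaled01" names here are only illustrative):

# rescale several columns with lapply() instead of a for loop
num.cols <- c("Age", "Height", "Income")
students[paste(num.cols, "rescaled01", sep = ".")] <- lapply(students[num.cols], rescale)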

Normalization

#https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/BostonHousing.csv"
download.file(url, destfile = "BostonHousing.csv", mode = "wb")


housing <- read.csv("BostonHousing.csv")
head(housing)
A data.frame: 6 × 14
   CRIM  ZN  INDUS  CHAS  NOX  RM  AGE  DIS  RAD  TAX  PTRATIO  B  LSTAT  MEDV
   <dbl>  <dbl>  <dbl>  <int>  <dbl>  <dbl>  <dbl>  <dbl>  <int>  <int>  <dbl>  <dbl>  <dbl>  <dbl>
1  0.00632  18  2.31  0  0.538  6.575  65.2  4.0900  1  296  15.3  396.90  4.98  24.0
2  0.02731  0  7.07  0  0.469  6.421  78.9  4.9671  2  242  17.8  396.90  9.14  21.6
3  0.02729  0  7.07  0  0.469  7.185  61.1  4.9671  2  242  17.8  392.83  4.03  34.7
4  0.03237  0  2.18  0  0.458  6.998  45.8  6.0622  3  222  18.7  394.63  2.94  33.4
5  0.06905  0  2.18  0  0.458  7.147  54.2  6.0622  3  222  18.7  396.90  5.33  36.2
6  0.02985  0  2.18  0  0.458  6.430  58.7  6.0622  3  222  18.7  394.12  5.21  28.7
# Standardize all the columns using the standard normal (z-score)
# https://es.wikipedia.org/wiki/Distribuci%C3%B3n_normal#Estandarizaci%C3%B3n_de_variables_aleatorias_normales
# center at 0 and scale to unit variance
housing.z <- scale(housing, center = TRUE, scale = TRUE)
head(housing.z)
A matrix: 6 × 14 of type dbl
CRIM  ZN  INDUS  CHAS  NOX  RM  AGE  DIS  RAD  TAX  PTRATIO  B  LSTAT  MEDV
-0.4193669  0.2845483  -1.2866362  -0.2723291  -0.1440749  0.4132629  -0.1198948  0.140075  -0.9818712  -0.6659492  -1.4575580  0.4406159  -1.0744990  0.1595278
-0.4169267  -0.4872402  -0.5927944  -0.2723291  -0.7395304  0.1940824  0.3668034  0.556609  -0.8670245  -0.9863534  -0.3027945  0.4406159  -0.4919525  -0.1014239
-0.4169290  -0.4872402  -0.5927944  -0.2723291  -0.7395304  1.2814456  -0.2655490  0.556609  -0.8670245  -0.9863534  -0.3027945  0.3960351  -1.2075324  1.3229375
-0.4163384  -0.4872402  -1.3055857  -0.2723291  -0.8344581  1.0152978  -0.8090878  1.076671  -0.7521778  -1.1050216  0.1129203  0.4157514  -1.3601708  1.1815886
-0.4120741  -0.4872402  -1.3055857  -0.2723291  -0.8344581  1.2273620  -0.5106743  1.076671  -0.7521778  -1.1050216  0.1129203  0.4406159  -1.0254866  1.4860323
-0.4166314  -0.4872402  -1.3055857  -0.2723291  -0.8344581  0.2068916  -0.3508100  1.076671  -0.7521778  -1.1050216  0.1129203  0.4101651  -1.0422909  0.6705582
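As a sanity check, scale() with center = TRUE and scale = TRUE is the classic z-score, z = (x - mean(x)) / sd(x). A quick sketch for a single column, comparing the manual computation with the scale() output:

# the manual z-score for CRIM should match the first column of housing.z
z.crim <- (housing$CRIM - mean(housing$CRIM)) / sd(housing$CRIM)
head(cbind(manual = z.crim, scaled = housing.z[, "CRIM"]))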
# subtract the mean from each value; this only shifts the distribution
housing.mean <- scale(housing, center = TRUE, scale = FALSE)
head(housing.mean)
A matrix: 6 × 14 of type dbl
CRIM  ZN  INDUS  CHAS  NOX  RM  AGE  DIS  RAD  TAX  PTRATIO  B  LSTAT  MEDV
-3.607204  6.636364  -8.826779  -0.06916996  -0.01669506  0.2903656  -3.374901  0.2949573  -8.549407  -112.2372  -3.1555336  40.22597  -7.673063  1.4671937
-3.586214  -11.363636  -4.066779  -0.06916996  -0.08569506  0.1363656  10.325099  1.1720573  -7.549407  -166.2372  -0.6555336  40.22597  -3.513063  -0.9328063
-3.586234  -11.363636  -4.066779  -0.06916996  -0.08569506  0.9003656  -7.474901  1.1720573  -7.549407  -166.2372  -0.6555336  36.15597  -8.623063  12.1671937
-3.581154  -11.363636  -8.956779  -0.06916996  -0.09669506  0.7133656  -22.774901  2.2671573  -6.549407  -186.2372  0.2444664  37.95597  -9.713063  10.8671937
-3.544474  -11.363636  -8.956779  -0.06916996  -0.09669506  0.8623656  -14.374901  2.2671573  -6.549407  -186.2372  0.2444664  40.22597  -7.323063  13.6671937
-3.583674  -11.363636  -8.956779  -0.06916996  -0.09669506  0.1453656  -9.874901  2.2671573  -6.549407  -186.2372  0.2444664  37.44597  -7.443063  6.1671937
# Scale only: divide each column by its root mean square (no centering)
housing.sd <- scale(housing, center = FALSE, scale = TRUE)
head(housing.sd)
A matrix: 6 × 14 of type dbl
CRIM  ZN  INDUS  CHAS  NOX  RM  AGE  DIS  RAD  TAX  PTRATIO  B  LSTAT  MEDV
0.0006773027  0.6936817  0.1764759  0  0.9485077  1.038710  0.8788221  0.9416646  0.07733878  0.6696347  0.8225709  1.077026  0.3425033  0.9852965
0.0029267624  0.0000000  0.5401231  0  0.8268590  1.014381  1.0634826  1.1436045  0.15467755  0.5474716  0.9569779  1.077026  0.6286104  0.8867668
0.0029246190  0.0000000  0.5401231  0  0.8268590  1.135077  0.8235588  1.1436045  0.15467755  0.5474716  0.9569779  1.065982  0.2771663  1.4245745
0.0034690332  0.0000000  0.1665443  0  0.8074657  1.105535  0.6173321  1.3957357  0.23201633  0.5022260  1.0053645  1.070867  0.2022007  1.3712043
0.0073999612  0.0000000  0.1665443  0  0.8074657  1.129073  0.7305546  1.3957357  0.23201633  0.5022260  1.0053645  1.077026  0.3665748  1.4861555
0.0031989695  0.0000000  0.1665443  0  0.8074657  1.015803  0.7912095  1.3957357  0.23201633  0.5022260  1.0053645  1.069483  0.3583217  1.1782504
# Does nothing: no centering and no scaling
housing.none <- scale(housing, center = FALSE, scale = FALSE)
head(housing.none)
A matrix: 6 × 14 of type dbl
CRIM  ZN  INDUS  CHAS  NOX  RM  AGE  DIS  RAD  TAX  PTRATIO  B  LSTAT  MEDV
0.00632  18  2.31  0  0.538  6.575  65.2  4.0900  1  296  15.3  396.90  4.98  24.0
0.02731  0  7.07  0  0.469  6.421  78.9  4.9671  2  242  17.8  396.90  9.14  21.6
0.02729  0  7.07  0  0.469  7.185  61.1  4.9671  2  242  17.8  392.83  4.03  34.7
0.03237  0  2.18  0  0.458  6.998  45.8  6.0622  3  222  18.7  394.63  2.94  33.4
0.06905  0  2.18  0  0.458  7.147  54.2  6.0622  3  222  18.7  396.90  5.33  36.2
0.02985  0  2.18  0  0.458  6.430  58.7  6.0622  3  222  18.7  394.12  5.21  28.7
# when center = FALSE, the scale used is the root mean square: sqrt(sum(x^2)/(n-1))
# function to standardize several columns
scale.many = function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "z", sep = ".")
    dataframe[name] <- scale(dataframe[,col])
  }
  cat(paste("Hemos normalizado ", length(cols), " variable(s)"))
  dataframe
}
housing <- scale.many(housing, c(1, 3, 5:8))
We have standardized 6 variable(s)
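One detail worth knowing: scale() stores the centering and scaling values it used as attributes of its result, which is useful for applying the same transformation to new data later. A small sketch on the housing.z matrix created above:

# the means and standard deviations used by scale() are kept as attributes
attr(housing.z, "scaled:center")  # per-column means
attr(housing.z, "scaled:scale")   # per-column standard deviations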

Data Binning

Categorization (binning) of numeric variables.

# read the data
students <- read.csv("data-conversion.csv")
students
A data.frame: 10 × 5
Age  State  Gender  Height  Income
<int>  <chr>  <chr>  <int>  <int>
23  NJ  F  61  5000
13  NY  M  55  1000
36  NJ  M  66  3000
31  VA  F  64  4000
58  NY  F  70  30000
29  TX  F  63  10000
39  NJ  M  67  50000
50  VA  M  70  55000
23  TX  F  61  2000
36  VA  M  66  20000
# create the vector of break points
# convert Income into 3 categories
bp <- c(-Inf, 10000, 31000, Inf)
names <- c("Low", "Average", "High")
# cut() splits the data into the bins
students$Income.cat <- cut(students$Income, breaks = bp, labels = names)
students
A data.frame: 10 × 6
Age  State  Gender  Height  Income  Income.cat
<int>  <chr>  <chr>  <int>  <int>  <fct>
23  NJ  F  61  5000  Low
13  NY  M  55  1000  Low
36  NJ  M  66  3000  Low
31  VA  F  64  4000  Low
58  NY  F  70  30000  Average
29  TX  F  63  10000  Low
39  NJ  M  67  50000  High
50  VA  M  70  55000  High
23  TX  F  61  2000  Low
36  VA  M  66  20000  Average
# cut the data without assigning labels
students$Income.cat2 <- cut(students$Income, breaks = bp)
students
A data.frame: 10 × 7
Age  State  Gender  Height  Income  Income.cat  Income.cat2
<int>  <chr>  <chr>  <int>  <int>  <fct>  <fct>
23  NJ  F  61  5000  Low  (-Inf,1e+04]
13  NY  M  55  1000  Low  (-Inf,1e+04]
36  NJ  M  66  3000  Low  (-Inf,1e+04]
31  VA  F  64  4000  Low  (-Inf,1e+04]
58  NY  F  70  30000  Average  (1e+04,3.1e+04]
29  TX  F  63  10000  Low  (-Inf,1e+04]
39  NJ  M  67  50000  High  (3.1e+04, Inf]
50  VA  M  70  55000  High  (3.1e+04, Inf]
23  TX  F  61  2000  Low  (-Inf,1e+04]
36  VA  M  66  20000  Average  (1e+04,3.1e+04]
# we can give the resulting categories any labels we like
# it is not necessary to define the cut points: breaks = 4 creates 4 equal-width bins
students$Income.cat3 <- cut(students$Income,
                            breaks = 4,
                            labels = c("Level 1", "Level 2",
                                       "Level 3", "Level 4")
                            )
students
A data.frame: 10 × 8
Age  State  Gender  Height  Income  Income.cat  Income.cat2  Income.cat3
<int>  <chr>  <chr>  <int>  <int>  <fct>  <fct>  <fct>
23  NJ  F  61  5000  Low  (-Inf,1e+04]  Level 1
13  NY  M  55  1000  Low  (-Inf,1e+04]  Level 1
36  NJ  M  66  3000  Low  (-Inf,1e+04]  Level 1
31  VA  F  64  4000  Low  (-Inf,1e+04]  Level 1
58  NY  F  70  30000  Average  (1e+04,3.1e+04]  Level 3
29  TX  F  63  10000  Low  (-Inf,1e+04]  Level 1
39  NJ  M  67  50000  High  (3.1e+04, Inf]  Level 4
50  VA  M  70  55000  High  (3.1e+04, Inf]  Level 4
23  TX  F  61  2000  Low  (-Inf,1e+04]  Level 1
36  VA  M  66  20000  Average  (1e+04,3.1e+04]  Level 2
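Besides fixed cut points and equal-width bins, a common variant is equal-frequency binning, where the breaks come from quantiles so each category gets roughly the same number of observations. A hedged sketch (the Income.cat4 column and its labels are only illustrative):

# quantile-based (equal-frequency) bins for Income
q.breaks <- quantile(students$Income, probs = seq(0, 1, length.out = 4))
students$Income.cat4 <- cut(students$Income,
                            breaks = q.breaks,
                            labels = c("Low", "Medium", "High"),
                            include.lowest = TRUE) # keep the minimum value in the first bin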

Dummy Variables

install.packages("fastDummies")
library(fastDummies)
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Thank you for using fastDummies!

To acknowledge our work, please cite the package:

Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# dummy variables for every categorical column
students <- read.csv("data-conversion.csv")
students.dummy <- dummy_cols(students)
head(students.dummy)
A data.frame: 6 × 11
   Age  State  Gender  Height  Income  State_NJ  State_NY  State_TX  State_VA  Gender_F  Gender_M
   <int>  <chr>  <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
1  23  NJ  F  61  5000  1  0  0  0  1  0
2  13  NY  M  55  1000  0  1  0  0  0  1
3  36  NJ  M  66  3000  1  0  0  0  0  1
4  31  VA  F  64  4000  0  0  0  1  1  0
5  58  NY  F  70  30000  0  1  0  0  1  0
6  29  TX  F  63  10000  0  0  1  0  1  0
# create dummy variables only for State
dummy_cols(students,
           select_columns = "State")
A data.frame: 10 × 9
Age  State  Gender  Height  Income  State_NJ  State_NY  State_TX  State_VA
<int>  <chr>  <chr>  <int>  <int>  <int>  <int>  <int>  <int>
23  NJ  F  61  5000  1  0  0  0
13  NY  M  55  1000  0  1  0  0
36  NJ  M  66  3000  1  0  0  0
31  VA  F  64  4000  0  0  0  1
58  NY  F  70  30000  0  1  0  0
29  TX  F  63  10000  0  0  1  0
39  NJ  M  67  50000  1  0  0  0
50  VA  M  70  55000  0  0  0  1
23  TX  F  61  2000  0  0  1  0
36  VA  M  66  20000  0  0  0  1
# Create dummy columns for State and Gender
dummy_cols(students,
           select_columns = c("State", "Gender"))
A data.frame: 10 × 11
Age  State  Gender  Height  Income  State_NJ  State_NY  State_TX  State_VA  Gender_F  Gender_M
<int>  <chr>  <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
23  NJ  F  61  5000  1  0  0  0  1  0
13  NY  M  55  1000  0  1  0  0  0  1
36  NJ  M  66  3000  1  0  0  0  0  1
31  VA  F  64  4000  0  0  0  1  1  0
58  NY  F  70  30000  0  1  0  0  1  0
29  TX  F  63  10000  0  0  1  0  1  0
39  NJ  M  67  50000  1  0  0  0  0  1
50  VA  M  70  55000  0  0  0  1  0  1
23  TX  F  61  2000  0  0  1  0  1  0
36  VA  M  66  20000  0  0  0  1  0  1
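Base R can build the same kind of indicator columns without extra packages, via model.matrix(); this is only an alternative sketch, not what the notebook uses:

# model.matrix() with "- 1" drops the intercept so every State level gets its own column
head(model.matrix(~ State - 1, data = students))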

Outliers

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/ozone.csv"
download.file(url, destfile = "ozone.csv", mode = "wb")

ozone.data <- read.csv("ozone.csv", stringsAsFactors = F) # do not convert strings to factors
boxplot(ozone.data$pressure_height,
        main = "Pressure Height",
        boxwex = 0.5)$out
  1. 5480
  2. 5410
  3. 5350
  4. 5480
  5. 5490
  6. 5470
  7. 5320
  8. 5420
  9. 5440
  10. 5480
  11. 5450
  12. 5500


summary(ozone.data$pressure_height)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   5320    5700    5770    5753    5830    5950      12
boxplot(pressure_height ~ Month,
        data = ozone.data,
        main = "Presure Height per Month"
        )


boxplot(ozone_reading ~ Month,
        data = ozone.data,
        main = "Ozone reading per Month")$out
  1. 11.06
  2. 9.93
  3. 22.89
  4. 24.29
  5. 29.79


# function that replaces low outliers (below the 5th percentile) with the mean
# and high outliers (above the 95th percentile) with the median
impute_outliers <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- mean(x, na.rm = removeNA)
  x[x>quantiles[2]] <- median(x, na.rm = removeNA)
  x
}
imputed_data <- impute_outliers(ozone.data$pressure_height)
par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Pressure with outliers")
boxplot(imputed_data, main = "Pressure without outliers")


replace_outliers <- function(x, removeNA = TRUE){
  qrts <- quantile(x, probs = c(0.25, 0.75), na.rm = removeNA)
  caps <- quantile(x, probs = c(.05, .95), na.rm = removeNA)
  iqr <- qrts[2]-qrts[1]
  h <- 1.5 * iqr
  x[x<qrts[1]-h] <- caps[1]
  x[x>qrts[2]+h] <- caps[2]
  x
}
capped_pressure_height <- replace_outliers(ozone.data$pressure_height)
par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Pressure with outliers")
boxplot(capped_pressure_height, main = "Pressure without outliers")
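If you prefer to inspect the outliers before replacing them, the same 1.5 * IQR rule can be used just to locate the offending rows. A minimal sketch, assuming the ozone.data columns used above:

# locate rows whose pressure_height falls outside the 1.5 * IQR fences
qrts <- quantile(ozone.data$pressure_height, probs = c(0.25, 0.75), na.rm = TRUE)
h <- 1.5 * (qrts[2] - qrts[1])
outlier.rows <- which(ozone.data$pressure_height < qrts[1] - h |
                      ozone.data$pressure_height > qrts[2] + h)
ozone.data[outlier.rows, c("Month", "pressure_height")]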


References