Por Jose R. Zapata

En este notebook se encuentra temas de preparación de datos con R como: datos faltantes, datos duplicados, datos atípicos, normalización, estandarización, codificación de variables categóricas, etc.

El desarrollo de este código se realizara en google Colab

En base a las herramientas actuales para el desarrollo de proyectos y productos de Ciencia de datos y Machine Learning, se recomienda el uso de Python, sin embargo, R es una herramienta muy poderosa para el análisis de datos y la visualización de los mismos.

Puede aprender python en el curso de Ciencia de Datos con Python.

Datos faltantes (Missing Data)

#Cargar datos
url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/missing-data.csv"
download.file(url, destfile = "missing-data.csv", mode = "wb")

data <- read.csv("missing-data.csv", na.strings = "") # donde hay espacios en blanco poner NA
data # dataframe con datos faltantes

A data.frame: 27 × 3
Income	Phone_type	Car_type
<int>	<chr>	<chr>
89800	Android	Luxury
47500	Android	Non-Luxury
45000	iPhone	Luxury
44700	NA	Luxury
59500	iPhone	Luxury
NA	Android	Non-Luxury
63300	iPhone	Non-Luxury
52900	Android	Luxury
78200	Android	Luxury
145100	iPhone	Luxury
88600	iPhone	Non-Luxury
65600	iPhone	Luxury
NA	Android	Non-Luxury
94600	Android	Luxury
59400	iPhone	Luxury
47300	iPhone	Non-Luxury
72100	NA	Luxury
0	iPhone	Non-Luxury
0	Android	Luxury
83000	iPhone	Luxury
64100	Android	Non-Luxury
42100	iPhone	Non-Luxury
0	iPhone	Luxury
91500	iPhone	Non-Luxury
51200	Android	Luxury
13800	iPhone	Non-Luxury
47500	iPhone	Non-Luxury

# Quitar las filas con datos NA
data.cleaned <- na.omit(data)
data.cleaned

A data.frame: 23 × 3
	Income	Phone_type	Car_type
	<int>	<chr>	<chr>
1	89800	Android	Luxury
2	47500	Android	Non-Luxury
3	45000	iPhone	Luxury
5	59500	iPhone	Luxury
7	63300	iPhone	Non-Luxury
8	52900	Android	Luxury
9	78200	Android	Luxury
10	145100	iPhone	Luxury
11	88600	iPhone	Non-Luxury
12	65600	iPhone	Luxury
14	94600	Android	Luxury
15	59400	iPhone	Luxury
16	47300	iPhone	Non-Luxury
18	0	iPhone	Non-Luxury
19	0	Android	Luxury
20	83000	iPhone	Luxury
21	64100	Android	Non-Luxury
22	42100	iPhone	Non-Luxury
23	0	iPhone	Luxury
24	91500	iPhone	Non-Luxury
25	51200	Android	Luxury
26	13800	iPhone	Non-Luxury
27	47500	iPhone	Non-Luxury

# verificar si un elemento del dataframe es na
is.na(data[4,2])

TRUE

# verificar si un elemento del dataframe es na
is.na(data[4,1])

FALSE

# verificar si en una columna hay na
is.na(data$Income)

FALSE
FALSE
FALSE
FALSE
FALSE
TRUE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
TRUE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE

Limpieza Selectiva

#Limpiar NA de solamente la variable Income
data.income.cleaned <- data[!is.na(data$Income),]
data.income.cleaned

A data.frame: 25 × 3
	Income	Phone_type	Car_type
	<int>	<chr>	<chr>
1	89800	Android	Luxury
2	47500	Android	Non-Luxury
3	45000	iPhone	Luxury
4	44700	NA	Luxury
5	59500	iPhone	Luxury
7	63300	iPhone	Non-Luxury
8	52900	Android	Luxury
9	78200	Android	Luxury
10	145100	iPhone	Luxury
11	88600	iPhone	Non-Luxury
12	65600	iPhone	Luxury
14	94600	Android	Luxury
15	59400	iPhone	Luxury
16	47300	iPhone	Non-Luxury
17	72100	NA	Luxury
18	0	iPhone	Non-Luxury
19	0	Android	Luxury
20	83000	iPhone	Luxury
21	64100	Android	Non-Luxury
22	42100	iPhone	Non-Luxury
23	0	iPhone	Luxury
24	91500	iPhone	Non-Luxury
25	51200	Android	Luxury
26	13800	iPhone	Non-Luxury
27	47500	iPhone	Non-Luxury

# Identificar si hay Filas completas para un data frame
complete.cases(data)

TRUE
TRUE
TRUE
FALSE
TRUE
FALSE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
FALSE
TRUE
TRUE
TRUE
FALSE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE

data.cleaned.2 <- data[complete.cases(data), ]
data.cleaned.2

A data.frame: 23 × 3
	Income	Phone_type	Car_type
	<int>	<chr>	<chr>
1	89800	Android	Luxury
2	47500	Android	Non-Luxury
3	45000	iPhone	Luxury
5	59500	iPhone	Luxury
7	63300	iPhone	Non-Luxury
8	52900	Android	Luxury
9	78200	Android	Luxury
10	145100	iPhone	Luxury
11	88600	iPhone	Non-Luxury
12	65600	iPhone	Luxury
14	94600	Android	Luxury
15	59400	iPhone	Luxury
16	47300	iPhone	Non-Luxury
18	0	iPhone	Non-Luxury
19	0	Android	Luxury
20	83000	iPhone	Luxury
21	64100	Android	Non-Luxury
22	42100	iPhone	Non-Luxury
23	0	iPhone	Luxury
24	91500	iPhone	Non-Luxury
25	51200	Android	Luxury
26	13800	iPhone	Non-Luxury
27	47500	iPhone	Non-Luxury

#Convertir los ceros de ingresos en NA
data$Income[data$Income == 0] <- NA
data

A data.frame: 27 × 3
Income	Phone_type	Car_type
<int>	<chr>	<chr>
89800	Android	Luxury
47500	Android	Non-Luxury
45000	iPhone	Luxury
44700	NA	Luxury
59500	iPhone	Luxury
NA	Android	Non-Luxury
63300	iPhone	Non-Luxury
52900	Android	Luxury
78200	Android	Luxury
145100	iPhone	Luxury
88600	iPhone	Non-Luxury
65600	iPhone	Luxury
NA	Android	Non-Luxury
94600	Android	Luxury
59400	iPhone	Luxury
47300	iPhone	Non-Luxury
72100	NA	Luxury
NA	iPhone	Non-Luxury
NA	Android	Luxury
83000	iPhone	Luxury
64100	Android	Non-Luxury
42100	iPhone	Non-Luxury
NA	iPhone	Luxury
91500	iPhone	Non-Luxury
51200	Android	Luxury
13800	iPhone	Non-Luxury
47500	iPhone	Non-Luxury

#Medidas de centralización y dispersión en una fila que tiene NA
mean(data$Income)

<NA>

mean(data$Income, na.rm = TRUE) #Ignorar los datos NA

65763.6363636364

sd(data$Income)

<NA>

sd(data$Income, na.rm = TRUE) #Ignorar los datos NA

26715.8691106552

Remplazar datos faltantes

# Cargar datos y donde hay espacios en blanco poner NA
data <- read.csv("missing-data.csv", na.strings = "")

# Reemplazar los datos faltantes por el promedio de la columna
data$Income.mean <- ifelse(is.na(data$Income), #condicion
                           mean(data$Income, na.rm = TRUE), # si es verdadera
                           data$Income # else
                           )
data

A data.frame: 27 × 4
Income	Phone_type	Car_type	Income.mean
<int>	<chr>	<chr>	<dbl>
89800	Android	Luxury	89800
47500	Android	Non-Luxury	47500
45000	iPhone	Luxury	45000
44700	NA	Luxury	44700
59500	iPhone	Luxury	59500
NA	Android	Non-Luxury	57872
63300	iPhone	Non-Luxury	63300
52900	Android	Luxury	52900
78200	Android	Luxury	78200
145100	iPhone	Luxury	145100
88600	iPhone	Non-Luxury	88600
65600	iPhone	Luxury	65600
NA	Android	Non-Luxury	57872
94600	Android	Luxury	94600
59400	iPhone	Luxury	59400
47300	iPhone	Non-Luxury	47300
72100	NA	Luxury	72100
0	iPhone	Non-Luxury	0
0	Android	Luxury	0
83000	iPhone	Luxury	83000
64100	Android	Non-Luxury	64100
42100	iPhone	Non-Luxury	42100
0	iPhone	Luxury	0
91500	iPhone	Non-Luxury	91500
51200	Android	Luxury	51200
13800	iPhone	Non-Luxury	13800
47500	iPhone	Non-Luxury	47500

# Reemplazar el valor faltante por un valor tomado aleatoriamente

#x es un vector de datos que puede contener NA
rand.impute <- function(x) {
  # missing contiene un vector de valores T/F dependiendo del NA de x
  missing <- is.na(x)
  #n.missing contiene cuantos valores son NA dentro de x
  n.missing <- sum(missing)
  #x.obs son los valores conocidos que tienen dato diferente de NA en x
  x.obs <- x[!missing]
  #por defecto, devolveré lo mismo que había entrado por parámetro
  imputed <- x
  #en los valores que faltaban, los reemplazamos por una muestra
  #de los que si conocemos (MAS)
  imputed[missing] <- sample(x.obs, n.missing, replace = TRUE)
  return (imputed)
}

random.impute.data.frame <- function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "imputed", sep = ".")
    dataframe[name] = rand.impute(dataframe[,col])
  }
  dataframe
}

data <- read.csv("missing-data.csv", na.strings = "")
data$Income[data$Income==0]<-NA
data <- random.impute.data.frame(data, c(1,2))

data

A data.frame: 27 × 5
Income	Phone_type	Car_type	Income.imputed	Phone_type.imputed
<int>	<chr>	<chr>	<int>	<chr>
89800	Android	Luxury	89800	Android
47500	Android	Non-Luxury	47500	Android
45000	iPhone	Luxury	45000	iPhone
44700	NA	Luxury	44700	iPhone
59500	iPhone	Luxury	59500	iPhone
NA	Android	Non-Luxury	51200	Android
63300	iPhone	Non-Luxury	63300	iPhone
52900	Android	Luxury	52900	Android
78200	Android	Luxury	78200	Android
145100	iPhone	Luxury	145100	iPhone
88600	iPhone	Non-Luxury	88600	iPhone
65600	iPhone	Luxury	65600	iPhone
NA	Android	Non-Luxury	59500	Android
94600	Android	Luxury	94600	Android
59400	iPhone	Luxury	59400	iPhone
47300	iPhone	Non-Luxury	47300	iPhone
72100	NA	Luxury	72100	Android
NA	iPhone	Non-Luxury	59500	iPhone
NA	Android	Luxury	59500	Android
83000	iPhone	Luxury	83000	iPhone
64100	Android	Non-Luxury	64100	Android
42100	iPhone	Non-Luxury	42100	iPhone
NA	iPhone	Luxury	59500	iPhone
91500	iPhone	Non-Luxury	91500	iPhone
51200	Android	Luxury	51200	Android
13800	iPhone	Non-Luxury	13800	iPhone
47500	iPhone	Non-Luxury	47500	iPhone

Eliminar Duplicados

# creacion de dataframe con datos duplicados
family.salary = c(40000, 60000, 50000, 80000, 60000, 70000, 60000)
family.size = c(4, 3, 2, 2, 3, 4, 3)
family.car = c("Lujo", "Compacto", "Utilitario", "Lujo",
               "Compacto", "Compacto", "Compacto")

family <- data.frame(family.salary, family.size, family.car)
family

A data.frame: 7 × 3
family.salary	family.size	family.car
<dbl>	<dbl>	<chr>
40000	4	Lujo
60000	3	Compacto
50000	2	Utilitario
80000	2	Lujo
60000	3	Compacto
70000	4	Compacto
60000	3	Compacto

# Devuelve los datos que no se repiten
family.unique <- unique(family)
family.unique

A data.frame: 5 × 3
	family.salary	family.size	family.car
	<dbl>	<dbl>	<chr>
1	40000	4	Lujo
2	60000	3	Compacto
3	50000	2	Utilitario
4	80000	2	Lujo
6	70000	4	Compacto

#detectar cuales son los que estas duplicados
duplicated(family)

FALSE
FALSE
FALSE
FALSE
TRUE
FALSE
TRUE

# Obtener los datos duplicados
family[duplicated(family),]

A data.frame: 2 × 3
	family.salary	family.size	family.car
	<dbl>	<dbl>	<chr>
5	60000	3	Compacto
7	60000	3	Compacto

Re-escalar datos

Escalar los datos de 0 a 1 o de 0 a 100

# instalarlo si no lo tiene
#install.packages("scales")
library(scales)

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/data-conversion.csv"
download.file(url, destfile = "data-conversion.csv", mode = "wb")

students <- read.csv("data-conversion.csv")
students

A data.frame: 10 × 5
Age	State	Gender	Height	Income
<int>	<chr>	<chr>	<int>	<int>
23	NJ	F	61	5000
13	NY	M	55	1000
36	NJ	M	66	3000
31	VA	F	64	4000
58	NY	F	70	30000
29	TX	F	63	10000
39	NJ	M	67	50000
50	VA	M	70	55000
23	TX	F	61	2000
36	VA	M	66	20000

# el valor minimo lo hace = 0
# El valor maximo es igual a 1
# es una tarnsformacion lineal
students$Income.rescaled <- rescale(students$Income)
students

A data.frame: 10 × 6
Age	State	Gender	Height	Income	Income.rescaled
<int>	<chr>	<chr>	<int>	<int>	<dbl>
23	NJ	F	61	5000	0.07407407
13	NY	M	55	1000	0.00000000
36	NJ	M	66	3000	0.03703704
31	VA	F	64	4000	0.05555556
58	NY	F	70	30000	0.53703704
29	TX	F	63	10000	0.16666667
39	NJ	M	67	50000	0.90740741
50	VA	M	70	55000	1.00000000
23	TX	F	61	2000	0.01851852
36	VA	M	66	20000	0.35185185

# Reescalado  con la formula
students$Income.rescaled2 <-(students$Income - min(students$Income))/ (max(students$Income) - min(students$Income))
students

A data.frame: 10 × 7
Age	State	Gender	Height	Income	Income.rescaled	Income.rescaled2
<int>	<chr>	<chr>	<int>	<int>	<dbl>	<dbl>
23	NJ	F	61	5000	0.07407407	0.07407407
13	NY	M	55	1000	0.00000000	0.00000000
36	NJ	M	66	3000	0.03703704	0.03703704
31	VA	F	64	4000	0.05555556	0.05555556
58	NY	F	70	30000	0.53703704	0.53703704
29	TX	F	63	10000	0.16666667	0.16666667
39	NJ	M	67	50000	0.90740741	0.90740741
50	VA	M	70	55000	1.00000000	1.00000000
23	TX	F	61	2000	0.01851852	0.01851852
36	VA	M	66	20000	0.35185185	0.35185185

#  haccer un reescaldo de 0 a 100
students$Income.rescaled3 <- rescale(students$Income, to = c(0, 100))
students

A data.frame: 10 × 8
Age	State	Gender	Height	Income	Income.rescaled	Income.rescaled2	Income.rescaled3
<int>	<chr>	<chr>	<int>	<int>	<dbl>	<dbl>	<dbl>
23	NJ	F	61	5000	0.07407407	0.07407407	7.407407
13	NY	M	55	1000	0.00000000	0.00000000	0.000000
36	NJ	M	66	3000	0.03703704	0.03703704	3.703704
31	VA	F	64	4000	0.05555556	0.05555556	5.555556
58	NY	F	70	30000	0.53703704	0.53703704	53.703704
29	TX	F	63	10000	0.16666667	0.16666667	16.666667
39	NJ	M	67	50000	0.90740741	0.90740741	90.740741
50	VA	M	70	55000	1.00000000	1.00000000	100.000000
23	TX	F	61	2000	0.01851852	0.01851852	1.851852
36	VA	M	66	20000	0.35185185	0.35185185	35.185185

# Funcion para reescalar varias columnas
rescale.many <- function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "rescaled", sep = ".")
    dataframe[name] <- rescale(dataframe[,col])
  }
  cat(paste("Hemos reescalado ", length(cols), " variable(s)"))
  dataframe
}

students <- rescale.many(students, c(1,4))
students

Hemos reescalado  2  variable(s)

A data.frame: 10 × 10
Age	State	Gender	Height	Income	Income.rescaled	Income.rescaled2	Income.rescaled3	Age.rescaled	Height.rescaled
<int>	<chr>	<chr>	<int>	<int>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
23	NJ	F	61	5000	0.07407407	0.07407407	7.407407	0.2222222	0.4000000
13	NY	M	55	1000	0.00000000	0.00000000	0.000000	0.0000000	0.0000000
36	NJ	M	66	3000	0.03703704	0.03703704	3.703704	0.5111111	0.7333333
31	VA	F	64	4000	0.05555556	0.05555556	5.555556	0.4000000	0.6000000
58	NY	F	70	30000	0.53703704	0.53703704	53.703704	1.0000000	1.0000000
29	TX	F	63	10000	0.16666667	0.16666667	16.666667	0.3555556	0.5333333
39	NJ	M	67	50000	0.90740741	0.90740741	90.740741	0.5777778	0.8000000
50	VA	M	70	55000	1.00000000	1.00000000	100.000000	0.8222222	1.0000000
23	TX	F	61	2000	0.01851852	0.01851852	1.851852	0.2222222	0.4000000
36	VA	M	66	20000	0.35185185	0.35185185	35.185185	0.5111111	0.7333333

Normalizacion

#https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/BostonHousing.csv"
download.file(url, destfile = "BostonHousing.csv", mode = "wb")


housing <- read.csv("BostonHousing.csv")
head(housing)

A data.frame: 6 × 14
	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
	<dbl>	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<dbl>	<dbl>	<int>	<int>	<dbl>	<dbl>	<dbl>	<dbl>
1	0.00632	18	2.31	0	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
2	0.02731	0	7.07	0	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
3	0.02729	0	7.07	0	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
4	0.03237	0	2.18	0	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
5	0.06905	0	2.18	0	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2
6	0.02985	0	2.18	0	0.458	6.430	58.7	6.0622	3	222	18.7	394.12	5.21	28.7

# Usando la normal estandar para todos los datos
# https://es.wikipedia.org/wiki/Distribuci%C3%B3n_normal#Estandarizaci%C3%B3n_de_variables_aleatorias_normales
# centrar en 0
housing.z <- scale(housing, center = TRUE, scale = TRUE)
head(housing.z)

A matrix: 6 × 14 of type dbl
CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
-0.4193669	0.2845483	-1.2866362	-0.2723291	-0.1440749	0.4132629	-0.1198948	0.140075	-0.9818712	-0.6659492	-1.4575580	0.4406159	-1.0744990	0.1595278
-0.4169267	-0.4872402	-0.5927944	-0.2723291	-0.7395304	0.1940824	0.3668034	0.556609	-0.8670245	-0.9863534	-0.3027945	0.4406159	-0.4919525	-0.1014239
-0.4169290	-0.4872402	-0.5927944	-0.2723291	-0.7395304	1.2814456	-0.2655490	0.556609	-0.8670245	-0.9863534	-0.3027945	0.3960351	-1.2075324	1.3229375
-0.4163384	-0.4872402	-1.3055857	-0.2723291	-0.8344581	1.0152978	-0.8090878	1.076671	-0.7521778	-1.1050216	0.1129203	0.4157514	-1.3601708	1.1815886
-0.4120741	-0.4872402	-1.3055857	-0.2723291	-0.8344581	1.2273620	-0.5106743	1.076671	-0.7521778	-1.1050216	0.1129203	0.4406159	-1.0254866	1.4860323
-0.4166314	-0.4872402	-1.3055857	-0.2723291	-0.8344581	0.2068916	-0.3508100	1.076671	-0.7521778	-1.1050216	0.1129203	0.4101651	-1.0422909	0.6705582

# restarle la resta la media a cada valor, esto desplaza la distribucion
housing.mean <- scale(housing, center = TRUE, scale = FALSE)
head(housing.mean)

A matrix: 6 × 14 of type dbl
CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
-3.607204	6.636364	-8.826779	-0.06916996	-0.01669506	0.2903656	-3.374901	0.2949573	-8.549407	-112.2372	-3.1555336	40.22597	-7.673063	1.4671937
-3.586214	-11.363636	-4.066779	-0.06916996	-0.08569506	0.1363656	10.325099	1.1720573	-7.549407	-166.2372	-0.6555336	40.22597	-3.513063	-0.9328063
-3.586234	-11.363636	-4.066779	-0.06916996	-0.08569506	0.9003656	-7.474901	1.1720573	-7.549407	-166.2372	-0.6555336	36.15597	-8.623063	12.1671937
-3.581154	-11.363636	-8.956779	-0.06916996	-0.09669506	0.7133656	-22.774901	2.2671573	-6.549407	-186.2372	0.2444664	37.95597	-9.713063	10.8671937
-3.544474	-11.363636	-8.956779	-0.06916996	-0.09669506	0.8623656	-14.374901	2.2671573	-6.549407	-186.2372	0.2444664	40.22597	-7.323063	13.6671937
-3.583674	-11.363636	-8.956779	-0.06916996	-0.09669506	0.1453656	-9.874901	2.2671573	-6.549407	-186.2372	0.2444664	37.44597	-7.443063	6.1671937

# Desviacion estandard
housing.sd <- scale(housing, center = FALSE, scale = TRUE)
head(housing.sd)

A matrix: 6 × 14 of type dbl
CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
0.0006773027	0.6936817	0.1764759	0.9485077	1.038710	0.8788221	0.9416646	0.07733878	0.6696347	0.8225709	1.077026	0.3425033	0.9852965
0.0029267624	0.0000000	0.5401231	0.8268590	1.014381	1.0634826	1.1436045	0.15467755	0.5474716	0.9569779	1.077026	0.6286104	0.8867668
0.0029246190	0.0000000	0.5401231	0.8268590	1.135077	0.8235588	1.1436045	0.15467755	0.5474716	0.9569779	1.065982	0.2771663	1.4245745
0.0034690332	0.0000000	0.1665443	0.8074657	1.105535	0.6173321	1.3957357	0.23201633	0.5022260	1.0053645	1.070867	0.2022007	1.3712043
0.0073999612	0.0000000	0.1665443	0.8074657	1.129073	0.7305546	1.3957357	0.23201633	0.5022260	1.0053645	1.077026	0.3665748	1.4861555
0.0031989695	0.0000000	0.1665443	0.8074657	1.015803	0.7912095	1.3957357	0.23201633	0.5022260	1.0053645	1.069483	0.3583217	1.1782504

# NO hace nada
housing.none <- scale(housing, center = FALSE, scale = FALSE)
head(housing.none)

A matrix: 6 × 14 of type dbl
CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	MEDV
0.00632	18	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
0.02731	0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
0.02729	0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
0.03237	0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
0.06905	0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2
0.02985	0	2.18	0.458	6.430	58.7	6.0622	3	222	18.7	394.12	5.21	28.7

#sd = sqrt(sum(x^2)/(n-1))
# funcion para escalar varias columnas
scale.many = function(dataframe, cols){
  names <- names(dataframe)
  for(col in cols){
    name <- paste(names[col], "z", sep = ".")
    dataframe[name] <- scale(dataframe[,col])
  }
  cat(paste("Hemos normalizado ", length(cols), " variable(s)"))
  dataframe
}

housing <- scale.many(housing, c(1, 3, 5:8))

Hemos normalizado  6  variable(s)

Data Binning

Categorizacion de variables numericas

# leer los datos
students <- read.csv("data-conversion.csv")
students

A data.frame: 10 × 5
Age	State	Gender	Height	Income
<int>	<chr>	<chr>	<int>	<int>
23	NJ	F	61	5000
13	NY	M	55	1000
36	NJ	M	66	3000
31	VA	F	64	4000
58	NY	F	70	30000
29	TX	F	63	10000
39	NJ	M	67	50000
50	VA	M	70	55000
23	TX	F	61	2000
36	VA	M	66	20000

# creacion de vectores de separacion
# convertir el income en 3 categorias
bp <- c(-Inf, 10000, 31000, Inf)
names <- c("Low", "Average", "High")

# cut para dividir los datos
students$Income.cat <- cut(students$Income, breaks = bp, labels = names)
students

A data.frame: 10 × 6
Age	State	Gender	Height	Income	Income.cat
<int>	<chr>	<chr>	<int>	<int>	<fct>
23	NJ	F	61	5000	Low
13	NY	M	55	1000	Low
36	NJ	M	66	3000	Low
31	VA	F	64	4000	Low
58	NY	F	70	30000	Average
29	TX	F	63	10000	Low
39	NJ	M	67	50000	High
50	VA	M	70	55000	High
23	TX	F	61	2000	Low
36	VA	M	66	20000	Average

# cortar los datos sin darle nombres
students$Income.cat2 <- cut(students$Income, breaks = bp)
students

A data.frame: 10 × 7
Age	State	Gender	Height	Income	Income.cat	Income.cat2
<int>	<chr>	<chr>	<int>	<int>	<fct>	<fct>
23	NJ	F	61	5000	Low	(-Inf,1e+04]
13	NY	M	55	1000	Low	(-Inf,1e+04]
36	NJ	M	66	3000	Low	(-Inf,1e+04]
31	VA	F	64	4000	Low	(-Inf,1e+04]
58	NY	F	70	30000	Average	(1e+04,3.1e+04]
29	TX	F	63	10000	Low	(-Inf,1e+04]
39	NJ	M	67	50000	High	(3.1e+04, Inf]
50	VA	M	70	55000	High	(3.1e+04, Inf]
23	TX	F	61	2000	Low	(-Inf,1e+04]
36	VA	M	66	20000	Average	(1e+04,3.1e+04]

# podemos poner cualquier nombre para las categorias que se crean
# no es necesario definir donde son los puntos de corte
students$Income.cat3 <- cut(students$Income,
                            breaks = 4,
                            labels = c("Level 1", "Level 2",
                                       "Level 3", "Level 4")
                            )
students

A data.frame: 10 × 8
Age	State	Gender	Height	Income	Income.cat	Income.cat2	Income.cat3
<int>	<chr>	<chr>	<int>	<int>	<fct>	<fct>	<fct>
23	NJ	F	61	5000	Low	(-Inf,1e+04]	Level 1
13	NY	M	55	1000	Low	(-Inf,1e+04]	Level 1
36	NJ	M	66	3000	Low	(-Inf,1e+04]	Level 1
31	VA	F	64	4000	Low	(-Inf,1e+04]	Level 1
58	NY	F	70	30000	Average	(1e+04,3.1e+04]	Level 3
29	TX	F	63	10000	Low	(-Inf,1e+04]	Level 1
39	NJ	M	67	50000	High	(3.1e+04, Inf]	Level 4
50	VA	M	70	55000	High	(3.1e+04, Inf]	Level 4
23	TX	F	61	2000	Low	(-Inf,1e+04]	Level 1
36	VA	M	66	20000	Average	(1e+04,3.1e+04]	Level 2

dummy variables

install.packages("fastDummies")
library(fastDummies)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Thank you for using fastDummies!

To acknowledge our work, please cite the package:

Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.

#dummy variables
students <- read.csv("data-conversion.csv")

students.dummy <- dummy_cols(students)
head(students.dummy)

A data.frame: 6 × 11
	Age	State	Gender	Height	Income	State_NJ	State_NY	State_TX	State_VA	Gender_F	Gender_M
	<int>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
1	23	NJ	F	61	5000	1	0	0	0	1	0
2	13	NY	M	55	1000	0	1	0	0	0	1
3	36	NJ	M	66	3000	1	0	0	0	0	1
4	31	VA	F	64	4000	0	0	0	1	1	0
5	58	NY	F	70	30000	0	1	0	0	1	0
6	29	TX	F	63	10000	0	0	1	0	1	0

# crear una variable dummy de state
dummy_cols(students,
           select_columns = "State")

A data.frame: 10 × 9
Age	State	Gender	Height	Income	State_NJ	State_NY	State_TX	State_VA
<int>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>
23	NJ	F	61	5000	1	0	0	0
13	NY	M	55	1000	0	1	0	0
36	NJ	M	66	3000	1	0	0	0
31	VA	F	64	4000	0	0	0	1
58	NY	F	70	30000	0	1	0	0
29	TX	F	63	10000	0	0	1	0
39	NJ	M	67	50000	1	0	0	0
50	VA	M	70	55000	0	0	0	1
23	TX	F	61	2000	0	0	1	0
36	VA	M	66	20000	0	0	0	1

# Crear dummies columns de state y Gender
dummy_cols(students,
           select_columns = c("State", "Gender"))

A data.frame: 10 × 11
Age	State	Gender	Height	Income	State_NJ	State_NY	State_TX	State_VA	Gender_F	Gender_M
<int>	<chr>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
23	NJ	F	61	5000	1	0	0	0	1	0
13	NY	M	55	1000	0	1	0	0	0	1
36	NJ	M	66	3000	1	0	0	0	0	1
31	VA	F	64	4000	0	0	0	1	1	0
58	NY	F	70	30000	0	1	0	0	1	0
29	TX	F	63	10000	0	0	1	0	1	0
39	NJ	M	67	50000	1	0	0	0	0	1
50	VA	M	70	55000	0	0	0	1	0	1
23	TX	F	61	2000	0	0	1	0	1	0
36	VA	M	66	20000	0	0	0	1	0	1

Outliers

url <- "https://github.com/JoseRZapata/Data_analysis_notebooks/raw/main/data/datasets/ozone.csv"
download.file(url, destfile = "ozone.csv", mode = "wb")

ozone.data <- read.csv("ozone.csv", stringsAsFactors = F) # No convierte NA a string

boxplot(ozone.data$pressure_height,
        main = "Pressure Height",
        boxwex = 0.5)$out

5480
5410
5350
5480
5490
5470
5320
5420
5440
5480
5450
5500

summary(ozone.data$pressure_height)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   5320    5700    5770    5753    5830    5950      12

boxplot(pressure_height ~ Month,
        data = ozone.data,
        main = "Presure Height per Month"
        )

boxplot(ozone_reading ~Month,
        data = ozone.data,
        main = "Ozone reading per Month")$out

11.06
9.93
22.89
24.29
29.79

# funcion para cambiar los outliers por la media y por la mediana
impute_outliers <- function(x, removeNA = TRUE){
  quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
  x[x<quantiles[1]] <- mean(x, na.rm = removeNA)
  x[x>quantiles[2]] <- median(x, na.rm = removeNA)
  x
}

imputed_data <- impute_outliers(ozone.data$pressure_height)

par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Presión con outliers")
boxplot(imputed_data, main = "Presión sin outliers")

replace_outliers <- function(x, removeNA = TRUE){
  qrts <- quantile(x, probs = c(0.25, 0.75), na.rm = removeNA)
  caps <- quantile(x, probs = c(.05, .95), na.rm = removeNA)
  iqr <- qrts[2]-qrts[1]
  h <- 1.5 * iqr
  x[x<qrts[1]-h] <- caps[1]
  x[x>qrts[2]+h] <- caps[2]
  x
}

capped_pressure_height <- replace_outliers(ozone.data$pressure_height)

par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Presión con outliers")
boxplot(capped_pressure_height, main = "Presión sin outliers")

Referencias

https://github.com/sfirke/janitor

Preparación de Datos con R