#Funciones a usar
source("lecturaDatos.R")
datos<- lecturaDatos("./data","datos.csv")
instancias <- nrow(datos)
instancias
## [1] 20000
variables <- ncol(datos)
variables
## [1] 51
Vamos a ver la estructura de los datos para ello, podemos usar funciones como:
#Vemos las priemeras 5
head(datos, 5)
## separation propensity length PredSS_r1_.1 PredSS_r1 PredSS_r1_1
## 1 27 0.636381526 161 E E E
## 2 63 0.702477191 187 H H C
## 3 19 -0.892407793 185 C H H
## 4 36 0.161958287 495 C H H
## 5 145 -0.005589508 405 C C C
## PredSS_r2_.1 PredSS_r2 PrSS_fq_cn_H PrSS_fq_cn_E PrSS_fq_cn_C
## 1 C C 0.269 0.115 0.615
## 2 H H 0.694 0.000 0.306
## 3 C C 0.333 0.000 0.667
## 4 E E 0.229 0.086 0.686
## 5 H H 0.354 0.125 0.521
## PrCN_fq_cn_0 PrRCH_fq_cn0 PrRCH_fq_cn1 PrSA_fq_cn_0 PrSA_fq_cn_4
## 1 0.308 0.346 0.192 0.308 0.385
## 2 0.161 0.194 0.161 0.290 0.161
## 3 0.056 0.222 0.056 0.278 0.167
## 4 0.229 0.343 0.086 0.200 0.286
## 5 0.062 0.167 0.076 0.396 0.097
## PrRCH_r1_.1 PrRCH_r1 PrRCH_r1_1 PrRCH_r2_.1 PrRCH_r2 PrRCH_r2_1
## 1 3 3 4 3 3 0
## 2 3 0 1 3 3 3
## 3 3 1 3 0 3 0
## 4 3 3 3 3 3 3
## 5 4 4 3 3 3 0
## PrCN_r1_.1 PrCN_r1 PrCN_r1_1 PrCN_r2 PrSA_r1 PrSA_r2_.1 PrSA_r2
## 1 4 4 4 3 0 3 1
## 2 3 1 1 1 4 3 3
## 3 3 3 4 3 3 4 1
## 4 3 3 1 4 1 0 0
## 5 <NA> 4 3 3 1 0 2
## PrSA_r2_1 PrRCH_fq_gl0 PrRCH_fq_gl4 AA_fq_cn_A AA_fq_cn_D AA_fq_cn_E
## 1 4 0.230 0.230 0.115 0.077 0.038
## 2 0 0.209 0.128 0.177 0.081 0.097
## 3 3 0.346 0.038 0.222 0.000 0.000
## 4 0 0.285 0.115 0.057 0.029 0.057
## 5 4 0.151 0.316 0.083 0.035 0.062
## AA_fq_cn_I AA_fq_cn_F PrSS_fq_gl_H PrCN_fq_gl_0 PrSA_fq_gl_0
## 1 0.077 0.038 0.130 0.106 0.354
## 2 0.081 0.016 0.775 0.209 0.278
## 3 0.056 0.000 0.416 0.205 0.151
## 4 0.029 0.029 0.168 0.186 0.182
## 5 0.049 0.035 0.375 0.067 0.383
## PSSM_r1_.4_A PSSM_r1_.4_N PSSM_r1_0_D PSSM_r1_1_W PSSM_r2_0_A
## 1 4 4 -6 -4 -5
## 2 2 -3 NA -7 4
## 3 -1 2 1 -2 -1
## 4 0 1 -2 -2 -5
## 5 -5 -4 -5 -1 -4
## PSSM_cn_.2_A PSSM_cn_.2_T PSSM_cn_0_H PSSM_cn_2_D PSSM_cn_2_V class
## 1 2 -5 2 -5 -5 negative
## 2 7 -3 1 0 -10 negative
## 3 -1 -1 -2 5 -3 positive
## 4 -3 -3 0 3 -1 positive
## 5 5 0 -1 -5 6 negative
#Vemos las 5 ultimas
tail(datos,5)
## separation propensity length PredSS_r1_.1 PredSS_r1 PredSS_r1_1
## 19996 66 0.03331837 642 H H H
## 19997 22 -0.57829209 431 C E E
## 19998 100 -0.94417931 193 H H H
## 19999 13 -1.62414913 143 H H C
## 20000 129 -0.12260333 172 C C C
## PredSS_r2_.1 PredSS_r2 PrSS_fq_cn_H PrSS_fq_cn_E PrSS_fq_cn_C
## 19996 H H 0.738 0.062 0.200
## 19997 C C 0.000 0.524 0.476
## 19998 C C 0.434 0.263 0.303
## 19999 C C 0.000 0.417 0.583
## 20000 C C 0.016 0.234 0.750
## PrCN_fq_cn_0 PrRCH_fq_cn0 PrRCH_fq_cn1 PrSA_fq_cn_0 PrSA_fq_cn_4
## 19996 0.123 0.154 0.092 0.292 0.108
## 19997 0.143 0.381 0.095 0.190 0.429
## 19998 0.222 0.283 0.111 0.202 0.222
## 19999 0.083 0.250 0.083 0.333 0.250
## 20000 0.438 0.547 0.125 0.117 NA
## PrRCH_r1_.1 PrRCH_r1 PrRCH_r1_1 PrRCH_r2_.1 PrRCH_r2 PrRCH_r2_1
## 19996 1 3 3 3 3 3
## 19997 3 3 3 0 0 3
## 19998 3 0 1 0 0 0
## 19999 1 0 1 0 0 2
## 20000 0 0 0 0 0 3
## PrCN_r1_.1 PrCN_r1 PrCN_r1_1 PrCN_r2 PrSA_r1 PrSA_r2_.1 PrSA_r2
## 19996 1 3 3 4 0 3 0
## 19997 3 3 3 1 3 4 4
## 19998 2 0 1 0 3 4 4
## 19999 2 0 1 1 4 4 4
## 20000 0 0 0 0 4 4 4
## PrSA_r2_1 PrRCH_fq_gl0 PrRCH_fq_gl4 AA_fq_cn_A AA_fq_cn_D AA_fq_cn_E
## 19996 0 0.212 0.235 0.154 0.046 0.123
## 19997 4 0.362 0.095 0.000 0.143 0.095
## 19998 3 0.249 0.067 0.091 0.040 0.091
## 19999 3 0.259 0.000 0.083 0.083 0.167
## 20000 4 0.628 0.000 0.102 0.031 0.062
## AA_fq_cn_I AA_fq_cn_F PrSS_fq_gl_H PrCN_fq_gl_0 PrSA_fq_gl_0
## 19996 0.046 0.015 0.358 0.187 0.274
## 19997 0.000 0.048 0.009 0.172 0.139
## 19998 0.030 0.030 0.280 0.223 0.223
## 19999 0.000 0.000 0.322 0.189 0.245
## 20000 NA 0.039 0.012 0.547 0.087
## PSSM_r1_.4_A PSSM_r1_.4_N PSSM_r1_0_D PSSM_r1_1_W PSSM_r2_0_A
## 19996 -1 -2 -6 -6 2
## 19997 0 1 -1 0 -1
## 19998 0 -2 0 -5 1
## 19999 0 1 2 -2 -3
## 20000 -1 -2 0 -1 2
## PSSM_cn_.2_A PSSM_cn_.2_T PSSM_cn_0_H PSSM_cn_2_D PSSM_cn_2_V
## 19996 6 -2 -5 -4 -1
## 19997 -1 -1 -5 -2 0
## 19998 -5 -5 1 -4 0
## 19999 -1 0 -2 -5 -1
## 20000 -3 0 -4 -1 1
## class
## 19996 negative
## 19997 negative
## 19998 negative
## 19999 negative
## 20000 negative
#Nombres de las variables
names(datos)
## [1] "separation" "propensity" "length" "PredSS_r1_.1"
## [5] "PredSS_r1" "PredSS_r1_1" "PredSS_r2_.1" "PredSS_r2"
## [9] "PrSS_fq_cn_H" "PrSS_fq_cn_E" "PrSS_fq_cn_C" "PrCN_fq_cn_0"
## [13] "PrRCH_fq_cn0" "PrRCH_fq_cn1" "PrSA_fq_cn_0" "PrSA_fq_cn_4"
## [17] "PrRCH_r1_.1" "PrRCH_r1" "PrRCH_r1_1" "PrRCH_r2_.1"
## [21] "PrRCH_r2" "PrRCH_r2_1" "PrCN_r1_.1" "PrCN_r1"
## [25] "PrCN_r1_1" "PrCN_r2" "PrSA_r1" "PrSA_r2_.1"
## [29] "PrSA_r2" "PrSA_r2_1" "PrRCH_fq_gl0" "PrRCH_fq_gl4"
## [33] "AA_fq_cn_A" "AA_fq_cn_D" "AA_fq_cn_E" "AA_fq_cn_I"
## [37] "AA_fq_cn_F" "PrSS_fq_gl_H" "PrCN_fq_gl_0" "PrSA_fq_gl_0"
## [41] "PSSM_r1_.4_A" "PSSM_r1_.4_N" "PSSM_r1_0_D" "PSSM_r1_1_W"
## [45] "PSSM_r2_0_A" "PSSM_cn_.2_A" "PSSM_cn_.2_T" "PSSM_cn_0_H"
## [49] "PSSM_cn_2_D" "PSSM_cn_2_V" "class"
Para ver un reumen de las variables y sus datos estadísticos:
summary(datos)
## separation propensity length PredSS_r1_.1
## Min. : 6.00 Min. :-1.88816 Min. : 50.0 C :9556
## 1st Qu.: 26.00 1st Qu.:-0.51366 1st Qu.: 144.0 E :4625
## Median : 57.00 Median :-0.16855 Median : 209.0 H :5434
## Mean : 93.67 Mean :-0.07537 Mean : 286.4 X : 185
## 3rd Qu.: 114.00 3rd Qu.: 0.21625 3rd Qu.: 356.0 NA's: 200
## Max. :1221.00 Max. : 2.63886 Max. :1244.0
##
## PredSS_r1 PredSS_r1_1 PredSS_r2_.1 PredSS_r2 PrSS_fq_cn_H
## C:9631 C :9545 C :9433 C:9560 Min. :0.0000
## E:4841 E :4785 E :4766 E:4823 1st Qu.:0.0000
## H:5528 H :5470 H :5601 H:5617 Median :0.2690
## NA's: 200 NA's: 200 Mean :0.2833
## 3rd Qu.:0.4290
## Max. :1.0000
## NA's :200
## PrSS_fq_cn_E PrSS_fq_cn_C PrCN_fq_cn_0 PrRCH_fq_cn0
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0630 1st Qu.:0.391 1st Qu.:0.0970 1st Qu.:0.199
## Median :0.1740 Median :0.497 Median :0.1670 Median :0.263
## Mean :0.2166 Mean :0.500 Mean :0.1965 Mean :0.275
## 3rd Qu.:0.3330 3rd Qu.:0.600 3rd Qu.:0.2640 3rd Qu.:0.341
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.000
## NA's :200 NA's :200 NA's :200 NA's :200
## PrRCH_fq_cn1 PrSA_fq_cn_0 PrSA_fq_cn_4 PrRCH_r1_.1
## Min. :0.0000 Min. :0.0000 Min. :0.0000 0 :5181
## 1st Qu.:0.0940 1st Qu.:0.1500 1st Qu.:0.1450 1 :2563
## Median :0.1300 Median :0.2220 Median :0.2180 2 : 833
## Mean :0.1396 Mean :0.2313 Mean :0.2441 3 :8486
## 3rd Qu.:0.1780 3rd Qu.:0.3020 3rd Qu.:0.3150 4 :2551
## Max. :0.8000 Max. :1.0000 Max. :1.0000 X : 186
## NA's :200 NA's :200 NA's :200 NA's: 200
## PrRCH_r1 PrRCH_r1_1 PrRCH_r2_.1 PrRCH_r2
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:1.00
## Median :3.000 Median :3.000 Median :3.000 Median :3.00
## Mean :2.079 Mean :2.036 Mean :2.018 Mean :2.07
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.00
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.00
## NA's :200 NA's :200
## PrRCH_r2_1 PrCN_r1_.1 PrCN_r1 PrCN_r1_1 PrCN_r2
## 0 :5252 0 :3634 Min. :0.000 Min. :0.000 Min. :0.00
## 1 :2718 1 :3089 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## 2 : 863 2 :1575 Median :3.000 Median :3.000 Median :3.00
## 3 :8370 3 :5999 Mean :2.366 Mean :2.306 Mean :2.36
## 4 :2405 4 :5316 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00
## X : 192 X : 187 Max. :4.000 Max. :4.000 Max. :4.00
## NA's: 200 NA's: 200 NA's :200
## PrSA_r1 PrSA_r2_.1 PrSA_r2 PrSA_r2_1
## Min. :0.00 Min. :0.000 Min. :0.000 0 :4713
## 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.000 1 :2879
## Median :3.00 Median :3.000 Median :3.000 2 :1136
## Mean :2.07 Mean :2.158 Mean :2.076 3 :6305
## 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.:3.000 4 :4575
## Max. :4.00 Max. :4.000 Max. :4.000 X : 192
## NA's :200 NA's: 200
## PrRCH_fq_gl0 PrRCH_fq_gl4 AA_fq_cn_A AA_fq_cn_D
## Min. :0.0570 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.2280 1st Qu.:0.0240 1st Qu.:0.04300 1st Qu.:0.03400
## Median :0.2740 Median :0.1000 Median :0.07600 Median :0.05700
## Mean :0.2844 Mean :0.1141 Mean :0.07983 Mean :0.05917
## 3rd Qu.:0.3390 3rd Qu.:0.1830 3rd Qu.:0.10800 3rd Qu.:0.07700
## Max. :0.7900 Max. :0.6340 Max. :0.66700 Max. :0.66700
## NA's :200 NA's :200 NA's :200 NA's :200
## AA_fq_cn_E AA_fq_cn_I AA_fq_cn_F PrSS_fq_gl_H
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.03800 1st Qu.:0.02800 1st Qu.:0.01500 1st Qu.:0.1630
## Median :0.06300 Median :0.05100 Median :0.03600 Median :0.2860
## Mean :0.06772 Mean :0.05479 Mean :0.03988 Mean :0.2887
## 3rd Qu.:0.09000 3rd Qu.:0.07500 3rd Qu.:0.05600 3rd Qu.:0.3910
## Max. :0.50000 Max. :0.40000 Max. :0.40000 Max. :0.9700
## NA's :200 NA's :200 NA's :200 NA's :200
## PrCN_fq_gl_0 PrSA_fq_gl_0 PSSM_r1_.4_A PSSM_r1_.4_N
## Min. :0.0130 Min. :0.0000 Min. :-11.0000 Min. :-12.000
## 1st Qu.:0.1340 1st Qu.:0.1690 1st Qu.: -2.0000 1st Qu.: -3.000
## Median :0.1810 Median :0.2210 Median : -1.0000 Median : -1.000
## Mean :0.2061 Mean :0.2288 Mean : -0.8582 Mean : -1.513
## 3rd Qu.:0.2620 3rd Qu.:0.2780 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. :0.7900 Max. :0.6940 Max. : 7.0000 Max. : 9.000
## NA's :200 NA's :200 NA's :200 NA's :200
## PSSM_r1_0_D PSSM_r1_1_W PSSM_r2_0_A PSSM_cn_.2_A
## Min. :-13.000 Min. :-12.000 Min. :-12.0000 Min. :-13.000
## 1st Qu.: -4.000 1st Qu.: -5.000 1st Qu.: -2.0000 1st Qu.: -2.000
## Median : -2.000 Median : -4.000 Median : -1.0000 Median : -1.000
## Mean : -1.919 Mean : -3.052 Mean : -0.8988 Mean : -0.872
## 3rd Qu.: 0.000 3rd Qu.: -2.000 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. : 9.000 Max. : 13.000 Max. : 7.0000 Max. : 8.000
## NA's :200 NA's :200 NA's :200 NA's :200
## PSSM_cn_.2_T PSSM_cn_0_H PSSM_cn_2_D PSSM_cn_2_V
## Min. :-13.0000 Min. :-12.000 Min. :-14.000 Min. :-13.000
## 1st Qu.: -2.0000 1st Qu.: -3.000 1st Qu.: -4.000 1st Qu.: -3.000
## Median : -1.0000 Median : -2.000 Median : -2.000 Median : -2.000
## Mean : -0.9923 Mean : -1.697 Mean : -1.667 Mean : -1.393
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.: 0.000
## Max. : 8.0000 Max. : 12.000 Max. : 9.000 Max. : 8.000
## NA's :200 NA's :200 NA's :200 NA's :200
## class
## negative:14010
## positive: 5990
##
##
##
##
##
Una función similar es describe del paquete Hmisc:
library("Hmisc")
describe(datos[1:3])
## datos[1:3]
##
## 3 Variables 20000 Observations
## ---------------------------------------------------------------------------
## separation
## n missing distinct Info Mean Gmd .05 .10
## 20000 0 706 1 93.67 99.4 9.0 13.0
## .25 .50 .75 .90 .95
## 26.0 57.0 114.0 210.1 308.0
##
## lowest : 6 7 8 9 10, highest: 1098 1142 1165 1166 1221
## ---------------------------------------------------------------------------
## propensity
## n missing distinct Info Mean Gmd .05 .10
## 20000 0 400 1 -0.07537 0.6824 -0.8201 -0.7404
## .25 .50 .75 .90 .95
## -0.5137 -0.1685 0.2162 0.8423 1.1630
##
## lowest : -1.888157 -1.624149 -1.444294 -1.442635 -1.434026
## highest: 1.356595 1.402798 1.410586 1.513896 2.638864
## ---------------------------------------------------------------------------
## length
## n missing distinct Info Mean Gmd .05 .10
## 20000 0 337 1 286.4 216.7 88 106
## .25 .50 .75 .90 .95
## 144 209 356 555 764
##
## lowest : 50 51 52 53 54, highest: 764 822 865 954 1244
## ---------------------------------------------------------------------------
Con esta primera toma de contacto podemos ver el tipo de datos, si tenemos valores perdidos, sus distribuciones…
El primer paso sería ver si estos valores perdidos son aleatorios o si siguen alguna distribución. De igual modo, para manejar esto podríamos utilizar estas aproximaciones:
El primer paso será obtener el porcentaje de variables con datos perdidos en cada instancia:
res<-apply(datos, 1, function(x) sum(is.na(x)))/ncol(datos)*100
res
## [1] 0.000000 1.960784 0.000000 0.000000 1.960784 0.000000 0.000000
## [8] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784
## [15] 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784
## [22] 1.960784 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [29] 0.000000 0.000000 3.921569 1.960784 0.000000 1.960784 1.960784
## [36] 5.882353 3.921569 0.000000 0.000000 0.000000 1.960784 1.960784
## [43] 1.960784 3.921569 0.000000 0.000000 1.960784 0.000000 0.000000
## [50] 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784 1.960784
## [57] 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784
## [64] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000
## [71] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [78] 0.000000 3.921569 1.960784 0.000000 1.960784 1.960784 0.000000
## [85] 0.000000 3.921569 0.000000 1.960784 0.000000 0.000000 0.000000
## [92] 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000
## [99] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [106] 0.000000 0.000000 0.000000 0.000000 0.000000 3.921569 0.000000
## [113] 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000 1.960784
## [120] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784
## [127] 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784 0.000000
## [134] 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [141] 3.921569 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [148] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [155] 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000 1.960784
## [162] 1.960784 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000
## [169] 1.960784 0.000000 1.960784 3.921569 1.960784 1.960784 0.000000
## [176] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [183] 0.000000 0.000000 5.882353 0.000000 0.000000 1.960784 0.000000
## [190] 3.921569 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [197] 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [204] 0.000000 0.000000 0.000000 1.960784 0.000000 3.921569 0.000000
## [211] 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000 0.000000
## [218] 0.000000 0.000000 1.960784 1.960784 0.000000 1.960784 0.000000
## [225] 1.960784 3.921569 0.000000 1.960784 0.000000 0.000000 1.960784
## [232] 3.921569 1.960784 1.960784 1.960784 0.000000 0.000000 0.000000
## [239] 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784 0.000000
## [246] 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000
## [253] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784
## [260] 1.960784 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000
## [267] 0.000000 0.000000 3.921569 0.000000 0.000000 0.000000 1.960784
## [274] 0.000000 1.960784 3.921569 1.960784 1.960784 0.000000 0.000000
## [281] 0.000000 0.000000 1.960784 0.000000 3.921569 0.000000 1.960784
## [288] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [295] 1.960784 3.921569 0.000000 0.000000 0.000000 1.960784 7.843137
## [302] 0.000000 1.960784 1.960784 1.960784 0.000000 0.000000 0.000000
## [309] 0.000000 0.000000 3.921569 0.000000 0.000000 0.000000 3.921569
## [316] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [323] 3.921569 0.000000 0.000000 0.000000 3.921569 0.000000 0.000000
## [330] 0.000000 1.960784 3.921569 0.000000 0.000000 1.960784 0.000000
## [337] 1.960784 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784
## [344] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [351] 3.921569 0.000000 1.960784 1.960784 0.000000 3.921569 0.000000
## [358] 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000
## [365] 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784 0.000000
## [372] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784
## [379] 0.000000 0.000000 0.000000 0.000000 0.000000 3.921569 1.960784
## [386] 0.000000 1.960784 0.000000 1.960784 0.000000 0.000000 0.000000
## [393] 1.960784 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [400] 0.000000 0.000000 1.960784 0.000000 0.000000 3.921569 0.000000
## [407] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [414] 1.960784 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [421] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [428] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [435] 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784
## [442] 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000
## [449] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [456] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [463] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000
## [470] 0.000000 1.960784 0.000000 1.960784 0.000000 0.000000 0.000000
## [477] 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000
## [484] 5.882353 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000
## [491] 1.960784 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000
## [498] 1.960784 0.000000 0.000000 3.921569 1.960784 0.000000 0.000000
## [505] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784
## [512] 1.960784 3.921569 0.000000 0.000000 0.000000 0.000000 0.000000
## [519] 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784 0.000000
## [526] 1.960784 0.000000 1.960784 0.000000 0.000000 1.960784 1.960784
## [533] 1.960784 1.960784 0.000000 3.921569 0.000000 1.960784 1.960784
## [540] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [547] 3.921569 0.000000 0.000000 0.000000 0.000000 3.921569 0.000000
## [554] 0.000000 5.882353 0.000000 0.000000 0.000000 0.000000 0.000000
## [561] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [568] 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784
## [575] 1.960784 0.000000 0.000000 0.000000 1.960784 1.960784 0.000000
## [582] 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000
## [589] 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784
## [596] 0.000000 0.000000 5.882353 1.960784 0.000000 0.000000 0.000000
## [603] 1.960784 1.960784 0.000000 0.000000 0.000000 3.921569 1.960784
## [610] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [617] 1.960784 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000
## [624] 0.000000 0.000000 1.960784 0.000000 1.960784 0.000000 0.000000
## [631] 1.960784 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000
## [638] 0.000000 0.000000 1.960784 0.000000 0.000000 1.960784 0.000000
## [645] 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000 1.960784
## [652] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [659] 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000 0.000000
## [666] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [673] 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000
## [680] 0.000000 1.960784 0.000000 0.000000 0.000000 3.921569 1.960784
## [687] 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784 0.000000
## [694] 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000
## [701] 1.960784 3.921569 0.000000 3.921569 1.960784 0.000000 0.000000
## [708] 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784 1.960784
## [715] 0.000000 3.921569 0.000000 0.000000 0.000000 5.882353 1.960784
## [722] 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784
## [729] 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784 1.960784
## [736] 0.000000 1.960784 1.960784 0.000000 0.000000 1.960784 0.000000
## [743] 3.921569 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [750] 3.921569 1.960784 0.000000 0.000000 0.000000 3.921569 1.960784
## [757] 1.960784 0.000000 3.921569 1.960784 0.000000 1.960784 1.960784
## [764] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.960784
## [771] 1.960784 0.000000 1.960784 0.000000 3.921569 0.000000 0.000000
## [778] 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000
## [785] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [792] 1.960784 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000
## [799] 1.960784 3.921569 0.000000 3.921569 0.000000 1.960784 0.000000
## [806] 0.000000 3.921569 0.000000 0.000000 0.000000 0.000000 1.960784
## [813] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [820] 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784
## [827] 0.000000 0.000000 1.960784 1.960784 0.000000 0.000000 3.921569
## [834] 0.000000 1.960784 1.960784 0.000000 0.000000 0.000000 0.000000
## [841] 0.000000 0.000000 0.000000 0.000000 1.960784 3.921569 0.000000
## [848] 1.960784 0.000000 1.960784 1.960784 0.000000 1.960784 0.000000
## [855] 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [862] 0.000000 0.000000 3.921569 0.000000 0.000000 1.960784 0.000000
## [869] 1.960784 0.000000 0.000000 0.000000 0.000000 1.960784 0.000000
## [876] 0.000000 0.000000 0.000000 0.000000 3.921569 0.000000 1.960784
## [883] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [890] 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000 1.960784
## [897] 1.960784 1.960784 1.960784 0.000000 3.921569 0.000000 1.960784
## [904] 1.960784 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000
## [911] 3.921569 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000
## [918] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [925] 3.921569 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [932] 1.960784 0.000000 0.000000 0.000000 0.000000 3.921569 0.000000
## [939] 3.921569 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [946] 1.960784 0.000000 1.960784 0.000000 0.000000 1.960784 0.000000
## [953] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [960] 0.000000 0.000000 0.000000 1.960784 0.000000 0.000000 0.000000
## [967] 1.960784 0.000000 0.000000 0.000000 1.960784 0.000000 1.960784
## [974] 0.000000 1.960784 0.000000 0.000000 0.000000 0.000000 0.000000
## [981] 0.000000 1.960784 0.000000 1.960784 0.000000 1.960784 1.960784
## [988] 0.000000 0.000000 3.921569 0.000000 1.960784 0.000000 0.000000
## [995] 0.000000 1.960784 1.960784 0.000000 1.960784 0.000000
## [ reached getOption("max.print") -- omitted 19000 entries ]
Vamos a marcar como instancias “malas” aquellas que tienen mas de un 5% de valores perdidos.
mal<-(res>5)
filtrados<- datos[!mal,]
filtrados
## separation propensity length PredSS_r1_.1 PredSS_r1 PredSS_r1_1
## 1 27 0.636381526 161 E E E
## 2 63 0.702477191 187 H H C
## 3 19 -0.892407793 185 C H H
## 4 36 0.161958287 495 C H H
## 5 145 -0.005589508 405 C C C
## 6 170 -0.278919206 412 H H H
## 7 112 -0.806811662 244 C C C
## 8 25 0.339083541 134 C C C
## 9 12 0.145021229 148 C C C
## 10 63 1.356594948 342 E E C
## 11 134 -0.498801547 492 C C C
## 12 416 -0.260871551 509 H H H
## 13 61 1.111784575 204 E E E
## 14 168 -1.060356684 319 C H H
## 15 185 -0.082983638 437 C C C
## 16 69 -0.152034578 156 H H H
## 17 53 -0.278919206 134 E E E
## 18 32 -0.264777001 367 E E C
## 19 43 -0.043197934 1244 C C C
## PredSS_r2_.1 PredSS_r2 PrSS_fq_cn_H PrSS_fq_cn_E PrSS_fq_cn_C
## 1 C C 0.269 0.115 0.615
## 2 H H 0.694 0.000 0.306
## 3 C C 0.333 0.000 0.667
## 4 E E 0.229 0.086 0.686
## 5 H H 0.354 0.125 0.521
## 6 C C 0.249 0.047 0.704
## 7 E E 0.180 0.459 0.360
## 8 C C 0.000 0.500 0.500
## 9 C C 0.000 0.182 0.818
## 10 C C 0.194 0.177 0.629
## 11 C H 0.263 0.173 0.564
## 12 E E 0.186 0.210 0.605
## 13 E E 0.200 0.400 0.400
## 14 C C 0.413 0.156 0.431
## 15 C C 0.239 0.310 0.451
## 16 H H 0.132 0.368 0.500
## 17 C C 0.000 0.404 0.596
## 18 C C 0.323 0.129 0.548
## 19 H H 0.262 0.000 0.738
## PrCN_fq_cn_0 PrRCH_fq_cn0 PrRCH_fq_cn1 PrSA_fq_cn_0 PrSA_fq_cn_4
## 1 0.308 0.346 0.192 0.308 0.385
## 2 0.161 0.194 0.161 0.290 0.161
## 3 0.056 0.222 0.056 0.278 0.167
## 4 0.229 0.343 0.086 0.200 0.286
## 5 0.062 0.167 0.076 0.396 0.097
## 6 0.077 0.101 0.107 0.456 0.095
## 7 0.171 0.324 0.189 0.189 0.261
## 8 0.000 0.167 0.042 0.458 0.125
## 9 0.545 0.455 0.000 0.000 0.818
## 10 0.048 0.097 0.161 0.290 0.097
## 11 0.188 0.316 0.128 0.211 0.278
## 12 0.311 0.349 0.099 0.142 0.373
## 13 0.083 0.183 0.133 0.317 0.217
## 14 0.066 0.162 0.096 0.413 0.108
## 15 0.114 0.234 0.158 0.288 0.174
## 16 0.221 0.324 0.132 0.221 0.324
## 17 0.385 0.442 0.154 0.135 0.519
## 18 0.355 0.484 0.226 0.161 0.355
## 19 0.286 0.286 0.167 0.167 0.262
## PrRCH_r1_.1 PrRCH_r1 PrRCH_r1_1 PrRCH_r2_.1 PrRCH_r2 PrRCH_r2_1
## 1 3 3 4 3 3 0
## 2 3 0 1 3 3 3
## 3 3 1 3 0 3 0
## 4 3 3 3 3 3 3
## 5 4 4 3 3 3 0
## 6 2 0 1 4 4 4
## 7 0 0 3 2 3 0
## 8 4 4 3 4 4 4
## 9 0 0 3 3 3 3
## 10 4 4 4 1 3 3
## 11 1 3 3 0 1 3
## 12 3 0 0 4 4 3
## 13 3 3 3 1 3 3
## 14 0 0 0 3 3 3
## 15 0 1 3 3 3 0
## 16 1 3 1 3 0 3
## 17 3 3 3 0 1 0
## 18 3 3 0 0 1 3
## 19 3 3 3 3 3 0
## PrCN_r1_.1 PrCN_r1 PrCN_r1_1 PrCN_r2 PrSA_r1 PrSA_r2_.1 PrSA_r2
## 1 4 4 4 3 0 3 1
## 2 3 1 1 1 4 3 3
## 3 3 3 4 3 3 4 1
## 4 3 3 1 4 1 0 0
## 5 <NA> 4 3 3 1 0 2
## 6 2 0 1 4 3 1 0
## 7 1 0 3 4 4 3 0
## 8 4 4 4 4 1 0 1
## 9 1 0 3 3 4 3 0
## 10 4 4 4 4 0 3 0
## 11 1 3 3 0 1 4 4
## 12 2 0 0 4 4 0 1
## 13 4 4 4 3 0 3 1
## 14 1 0 0 3 4 0 2
## 15 1 1 3 3 3 3 1
## 16 1 3 1 1 0 0 3
## 17 4 3 4 3 3 4 4
## 18 3 3 1 1 0 4 3
## 19 4 3 3 3 1 0 1
## PrSA_r2_1 PrRCH_fq_gl0 PrRCH_fq_gl4 AA_fq_cn_A AA_fq_cn_D AA_fq_cn_E
## 1 4 0.230 0.230 0.115 0.077 0.038
## 2 0 0.209 0.128 0.177 0.081 0.097
## 3 3 0.346 0.038 0.222 0.000 0.000
## 4 0 0.285 0.115 0.057 0.029 0.057
## 5 4 0.151 0.316 0.083 0.035 0.062
## 6 0 0.126 0.413 0.160 0.030 0.041
## 7 3 0.299 0.020 0.045 0.063 0.117
## 8 0 0.216 0.239 0.042 0.042 0.042
## 9 4 0.372 0.020 0.091 0.091 0.000
## 10 0 0.228 0.216 0.097 0.081 0.000
## 11 3 0.323 0.128 0.053 0.060 0.045
## 12 0 0.344 0.134 0.077 0.075 0.060
## 13 2 0.230 0.093 0.100 0.033 0.083
## 14 0 0.163 0.279 0.138 0.054 0.066
## 15 3 0.197 0.233 0.054 0.076 0.098
## 16 1 0.365 0.000 0.029 0.059 0.029
## 17 4 0.396 0.000 0.038 0.058 0.096
## 18 2 0.213 0.253 0.097 0.065 0.097
## 19 3 0.198 0.270 0.143 0.119 0.071
## AA_fq_cn_I AA_fq_cn_F PrSS_fq_gl_H PrCN_fq_gl_0 PrSA_fq_gl_0
## 1 0.077 0.038 0.130 0.106 0.354
## 2 0.081 0.016 0.775 0.209 0.278
## 3 0.056 0.000 0.416 0.205 0.151
## 4 0.029 0.029 0.168 0.186 0.182
## 5 0.049 0.035 0.375 0.067 0.383
## 6 0.077 0.018 0.376 0.073 0.398
## 7 0.126 0.063 0.168 0.123 0.238
## 8 0.083 0.083 0.306 0.142 0.328
## 9 0.000 0.000 0.101 0.250 0.081
## 10 0.097 0.000 0.237 0.117 0.307
## 11 0.060 0.090 0.417 0.244 0.177
## 12 0.065 0.036 0.161 0.299 0.130
## 13 0.083 0.017 0.397 0.206 0.260
## 14 0.072 0.030 0.417 0.075 0.392
## 15 0.087 0.033 0.249 0.069 0.343
## 16 0.029 0.074 0.173 0.295 0.115
## 17 0.038 0.019 0.067 0.351 0.164
## 18 0.065 0.000 0.302 0.172 0.332
## 19 0.071 0.048 0.211 0.160 0.260
## PSSM_r1_.4_A PSSM_r1_.4_N PSSM_r1_0_D PSSM_r1_1_W PSSM_r2_0_A
## 1 4 4 -6 -4 -5
## 2 2 -3 NA -7 4
## 3 -1 2 1 -2 -1
## 4 0 1 -2 -2 -5
## 5 -5 -4 -5 -1 -4
## 6 3 -2 3 -4 -3
## 7 -2 1 3 -2 3
## 8 6 -1 -6 -6 5
## 9 0 -4 -1 2 -2
## 10 -2 -3 -4 -6 -3
## 11 -2 1 -3 1 0
## 12 1 -1 3 -2 1
## 13 -2 1 -6 -6 -3
## 14 3 -2 NA -5 5
## 15 0 0 3 -5 -2
## 16 -2 0 -5 -6 -1
## 17 -2 -4 -2 -3 1
## 18 -3 -5 -5 -4 -1
## 19 -1 -3 -5 1 0
## PSSM_cn_.2_A PSSM_cn_.2_T PSSM_cn_0_H PSSM_cn_2_D PSSM_cn_2_V
## 1 2 -5 2 -5 -5
## 2 7 -3 1 0 -10
## 3 -1 -1 -2 5 -3
## 4 -3 -3 0 3 -1
## 5 5 0 -1 -5 6
## 6 1 -3 0 -5 -4
## 7 3 -1 1 -5 -1
## 8 0 1 2 -2 -1
## 9 -1 -1 -9 -1 -3
## 10 4 -1 1 -4 1
## 11 -3 NA -8 -7 4
## 12 1 -1 -4 -4 0
## 13 0 -1 -3 2 0
## 14 -2 -4 -7 -6 -2
## 15 -1 4 0 -3 0
## 16 -1 -2 -1 -5 -6
## 17 -2 -2 -2 1 -3
## 18 -1 1 -5 1 -3
## 19 0 -1 -2 -2 2
## class
## 1 negative
## 2 negative
## 3 positive
## 4 positive
## 5 negative
## 6 negative
## 7 negative
## 8 positive
## 9 positive
## 10 positive
## 11 negative
## 12 positive
## 13 negative
## 14 negative
## 15 negative
## 16 positive
## 17 negative
## 18 negative
## 19 negative
## [ reached getOption("max.print") -- omitted 19854 rows ]
Ahora escribimos los datos, para evitar tener que volver a re-hacer todo en posteriores etapas del proceso.
escrituraDatos("./data", "datosFiltrados.csv", filtrados)
Para obtener mejores resultados, podemos utilizar versión paralela de apply:
library(parallel)
cores <- detectCores()
cluster <- makeCluster(cores-2)
system.time(parRapply(cluster, datos, function(x) sum(is.na(x)))/ncol(datos)*100)
## user system elapsed
## 0.039 0.011 0.771
stopCluster(cluster)
También podemos obtener el patron de valores perdidos, que nos dirá si estos son aleatorios o no.
library(mice)
datos <- airquality
patron<- md.pattern(x=datos)
patron
## Wind Temp Month Day Solar.R Ozone
## 111 1 1 1 1 1 1 0
## 35 1 1 1 1 1 0 1
## 5 1 1 1 1 0 1 1
## 2 1 1 1 1 0 0 2
## 0 0 0 0 7 37 44
Vemos que hay 2 instancias donde falla el valor de solar.R y ozone. 5 en las que no aparece el valor de Solar.R, 35 instancias en las que no aparece el valor ozone y 111 completas.
Esto podemos verlo tambien gráficamente para ello:
require(VIM)
plot <- aggr(datos, col=c('blue', 'red'), numbers=TRUE,
sortVars=T, labels=names(data), cex.axix=.5,
gap=1, ylab=c("Graficos de datos perdidos","Patron"))
##
## Variables sorted by number of missings:
## Variable Count
## Ozone 0.24183007
## Solar.R 0.04575163
## Wind 0.00000000
## Temp 0.00000000
## Month 0.00000000
## Day 0.00000000
El anterior gráfico nos muestra lo mismo que la funcion md.pattern pero de manera gráfica.
También podemos ver la distribución de una variable respecto de la otra.
marginplot(datos[,1:2])
Podemos considerar la distribución de los valores perdidos como aleatoria. Vamos a imputar los valores perdidos
library(mice)
library(lattice)
# se determina el numero de instancias sin datos perdidos y con datos
# perdidos. A observar la comodidad de uso de las funciones ncc e nic
completos <- mice::ncc(datos)
incompletos <- mice::nic(datos)
cat("Datos completos: ",completos, " e incompletos: ",incompletos,"\n")
## Datos completos: 111 e incompletos: 42
# se realiza la imputacion con el método pmm
imputados <- mice::mice(datos, m=5, meth="pmm")
##
## iter imp variable
## 1 1 Ozone Solar.R
## 1 2 Ozone Solar.R
## 1 3 Ozone Solar.R
## 1 4 Ozone Solar.R
## 1 5 Ozone Solar.R
## 2 1 Ozone Solar.R
## 2 2 Ozone Solar.R
## 2 3 Ozone Solar.R
## 2 4 Ozone Solar.R
## 2 5 Ozone Solar.R
## 3 1 Ozone Solar.R
## 3 2 Ozone Solar.R
## 3 3 Ozone Solar.R
## 3 4 Ozone Solar.R
## 3 5 Ozone Solar.R
## 4 1 Ozone Solar.R
## 4 2 Ozone Solar.R
## 4 3 Ozone Solar.R
## 4 4 Ozone Solar.R
## 4 5 Ozone Solar.R
## 5 1 Ozone Solar.R
## 5 2 Ozone Solar.R
## 5 3 Ozone Solar.R
## 5 4 Ozone Solar.R
## 5 5 Ozone Solar.R
# Tambien se dispone de algunos metodos que imputan siempre a un unico valor, como por ejemplo "mean"
imputadosMean <- mice::mice(datos, m=1, meth="mean")
##
## iter imp variable
## 1 1 Ozone Solar.R
## 2 1 Ozone Solar.R
## 3 1 Ozone Solar.R
## 4 1 Ozone Solar.R
## 5 1 Ozone Solar.R
#pmm es el metodo por defecto. Puedes verse todos los metodos disponibles de la siguiente forma
methods(mice)
## Warning in .S3methods(generic.function, class, parent.frame()): function
## 'mice' appears not to be S3 generic; found functions that look like S3
## methods
## [1] mice.impute.2l.lmer mice.impute.2l.norm
## [3] mice.impute.2l.pan mice.impute.2lonly.mean
## [5] mice.impute.2lonly.norm mice.impute.2lonly.pmm
## [7] mice.impute.cart mice.impute.lda
## [9] mice.impute.logreg mice.impute.logreg.boot
## [11] mice.impute.mean mice.impute.midastouch
## [13] mice.impute.norm mice.impute.norm.boot
## [15] mice.impute.norm.nob mice.impute.norm.predict
## [17] mice.impute.passive mice.impute.pmm
## [19] mice.impute.polr mice.impute.polyreg
## [21] mice.impute.quadratic mice.impute.rf
## [23] mice.impute.ri mice.impute.sample
## [25] mice.mids mice.theme
## see '?methods' for accessing help and source code
#se completa el conjunto de datos con las imputaciones
datosImputados <- mice::complete(imputados)
Por último pueden realizarse gráficos para ver como se ha comportado la imputación de los datos.
# perdidos en la parte ya limpia
completos <- mice::ncc(datosImputados)
incompletos <- mice::nic(datosImputados)
cat("Datos completos: ",completos, " e incompletos: ",incompletos,"\n")
## Datos completos: 153 e incompletos: 0
# se muestra la imputacion para Ozone
imputados$imp$Ozone
## 1 2 3 4 5
## 5 19 14 18 19 18
## 10 23 23 11 16 14
## 25 8 18 8 6 6
## 26 19 13 37 19 37
## 27 37 20 21 9 13
## 32 47 16 39 52 44
## 33 36 36 36 13 13
## 34 28 13 37 18 32
## 35 37 66 63 59 47
## 36 96 108 89 64 59
## 37 44 13 14 30 30
## 39 82 78 50 82 168
## 42 76 78 61 79 80
## 43 76 77 78 91 82
## 45 44 65 16 18 65
## 46 52 52 28 16 28
## 52 44 63 23 35 46
## 53 20 96 64 37 35
## 54 44 49 59 23 32
## 55 47 71 23 63 47
## 56 65 59 39 45 36
## 57 45 40 20 44 29
## 58 12 30 27 11 21
## 59 23 45 36 45 23
## 60 32 12 14 21 14
## 61 78 64 39 39 49
## 65 28 52 45 44 31
## 72 7 47 32 16 44
## 75 73 35 20 71 59
## 83 35 35 40 40 63
## 84 35 32 29 44 47
## 102 79 168 80 66 85
## 103 28 46 39 7 45
## 107 23 11 30 16 22
## 115 14 12 12 14 22
## 119 80 168 78 135 168
## 150 34 7 21 21 12
# Se muestra un grafico para comprobar la distribucion de Ozone en los datos imputados en relacion a otras variables. Los puntos en azul repreentan datos observados y datos en rojo representan imputaciones
lattice::xyplot(imputados,Ozone ~ Solar.R,pch=18,cex=1)
# Se muestran las densidades de los datos imputados respecto de los observados
lattice::densityplot(imputados)
# Se muestran los diagramas de caja para las imputaciones
lattice::bwplot(imputados)
Otro paquete para imputar es rob compositions que utiliza KNN para la imputación. Este método tambien permite realizar gráficos sobre la imputación.
require(robCompositions)
# se hace la imputacion
imputados <- robCompositions::impKNNa(datos)
#Ahora puede visualizarse alguna informacion sobre la forma en que se hizo la imputacion. El segundo argumento indica el tipo de grafico a obtener
plot(imputados, which=2)
##
## Click on a coordinate axis to highlight missings of the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
# El conjunto de datos completo puede accederse de la siguiente forma
imputados$xImp
## Ozone Solar.R Wind Temp Month Day
## [1,] 41.000000 190.00000 7.4 67 5 1
## [2,] 36.000000 118.00000 8.0 72 5 2
## [3,] 12.000000 149.00000 12.6 74 5 3
## [4,] 18.000000 313.00000 11.5 62 5 4
## [5,] 21.054545 250.99507 14.3 56 5 5
## [6,] 28.000000 106.89130 14.9 66 5 6
## [7,] 23.000000 299.00000 8.6 65 5 7
## [8,] 19.000000 99.00000 13.8 59 5 8
## [9,] 8.000000 19.00000 20.1 61 5 9
## [10,] 20.588235 194.00000 8.6 69 5 10
## [11,] 7.000000 15.41284 6.9 74 5 11
## [12,] 16.000000 256.00000 9.7 69 5 12
## [13,] 11.000000 290.00000 9.2 66 5 13
## [14,] 14.000000 274.00000 10.9 68 5 14
## [15,] 18.000000 65.00000 13.2 58 5 15
## [16,] 14.000000 334.00000 11.5 64 5 16
## [17,] 34.000000 307.00000 12.0 66 5 17
## [18,] 6.000000 78.00000 18.4 57 5 18
## [19,] 30.000000 322.00000 11.5 68 5 19
## [20,] 11.000000 44.00000 9.7 62 5 20
## [21,] 1.000000 8.00000 9.7 59 5 21
## [22,] 11.000000 320.00000 16.6 73 5 22
## [23,] 4.000000 25.00000 9.7 61 5 23
## [24,] 32.000000 92.00000 12.0 61 5 24
## [25,] 30.000000 66.00000 16.6 57 5 25
## [26,] 40.344828 266.00000 14.9 58 5 26
## [27,] 4.281346 44.10188 8.0 57 5 27
## [28,] 23.000000 13.00000 12.0 67 5 28
## [29,] 45.000000 252.00000 14.9 81 5 29
## [30,] 115.000000 223.00000 5.7 79 5 30
## [31,] 37.000000 279.00000 7.4 76 5 31
## [32,] 47.648649 286.00000 8.6 78 6 1
## [33,] 51.663043 287.00000 9.7 74 6 2
## [34,] 25.200000 242.00000 16.1 67 6 3
## [35,] 32.000000 186.00000 9.2 84 6 4
## [36,] 31.559633 220.00000 8.6 85 6 5
## [37,] 52.477064 264.00000 14.3 79 6 6
## [38,] 29.000000 127.00000 9.7 82 6 7
## [39,] 88.000000 273.00000 6.9 87 6 8
## [40,] 71.000000 291.00000 13.8 90 6 9
## [41,] 39.000000 323.00000 11.5 87 6 10
## [42,] 23.692308 259.00000 10.9 93 6 11
## [43,] 24.705882 250.00000 9.2 92 6 12
## [44,] 23.000000 148.00000 8.0 82 6 13
## [45,] 14.000000 332.00000 13.8 80 6 14
## [46,] 15.000000 322.00000 11.5 79 6 15
## [47,] 21.000000 191.00000 14.9 77 6 16
## [48,] 37.000000 284.00000 20.7 72 6 17
## [49,] 20.000000 37.00000 9.2 65 6 18
## [50,] 12.000000 120.00000 11.5 73 6 19
## [51,] 13.000000 137.00000 10.3 76 6 20
## [52,] 58.961538 150.00000 6.3 77 6 21
## [53,] 89.517241 59.00000 1.7 76 6 22
## [54,] 17.692308 91.00000 4.6 76 6 23
## [55,] 51.200000 250.00000 6.3 76 6 24
## [56,] 67.857143 135.00000 8.0 75 6 25
## [57,] 16.900000 127.00000 8.0 78 6 26
## [58,] 14.850000 47.00000 10.3 73 6 27
## [59,] 37.333333 98.00000 11.5 80 6 28
## [60,] 11.863636 31.00000 14.9 77 6 29
## [61,] 81.428571 138.00000 8.0 83 6 30
## [62,] 135.000000 269.00000 4.1 84 7 1
## [63,] 49.000000 248.00000 9.2 85 7 2
## [64,] 32.000000 236.00000 9.2 81 7 3
## [65,] 22.503226 101.00000 10.9 84 7 4
## [66,] 64.000000 175.00000 4.6 83 7 5
## [67,] 40.000000 314.00000 10.9 83 7 6
## [68,] 77.000000 276.00000 5.1 88 7 7
## [69,] 97.000000 267.00000 6.3 92 7 8
## [70,] 97.000000 272.00000 5.7 92 7 9
## [71,] 85.000000 175.00000 7.4 89 7 10
## [72,] 40.333333 139.00000 8.6 82 7 11
## [73,] 10.000000 264.00000 14.3 73 7 12
## [74,] 27.000000 175.00000 14.9 81 7 13
## [75,] 19.866667 291.00000 14.9 91 7 14
## [76,] 7.000000 48.00000 14.3 80 7 15
## [77,] 48.000000 260.00000 6.9 81 7 16
## [78,] 35.000000 274.00000 10.3 82 7 17
## [79,] 61.000000 285.00000 6.3 84 7 18
## [80,] 79.000000 187.00000 5.1 87 7 19
## [81,] 63.000000 220.00000 11.5 85 7 20
## [82,] 16.000000 7.00000 6.9 74 7 21
## [83,] 45.294118 258.00000 9.7 81 7 22
## [84,] 47.352941 295.00000 11.5 82 7 23
## [85,] 80.000000 294.00000 8.6 86 7 24
## [86,] 108.000000 223.00000 8.0 85 7 25
## [87,] 20.000000 81.00000 8.6 82 7 26
## [88,] 52.000000 82.00000 12.0 86 7 27
## [89,] 82.000000 213.00000 7.4 88 7 28
## [90,] 50.000000 275.00000 7.4 86 7 29
## [91,] 64.000000 253.00000 7.4 83 7 30
## [92,] 59.000000 254.00000 9.2 81 7 31
## [93,] 39.000000 83.00000 6.9 81 8 1
## [94,] 9.000000 24.00000 13.8 81 8 2
## [95,] 16.000000 77.00000 7.4 82 8 3
## [96,] 78.000000 168.00000 6.9 86 8 4
## [97,] 35.000000 205.21739 7.4 85 8 5
## [98,] 66.000000 267.00000 4.6 87 8 6
## [99,] 122.000000 255.00000 4.0 89 8 7
## [100,] 89.000000 229.00000 10.3 90 8 8
## [101,] 110.000000 207.00000 8.0 90 8 9
## [102,] 86.407767 222.00000 8.6 92 8 10
## [103,] 42.166667 137.00000 11.5 86 8 11
## [104,] 44.000000 192.00000 11.5 86 8 12
## [105,] 28.000000 273.00000 11.5 82 8 13
## [106,] 65.000000 157.00000 9.7 80 8 14
## [107,] 16.666667 64.00000 11.5 79 8 15
## [108,] 22.000000 71.00000 10.3 77 8 16
## [109,] 59.000000 51.00000 6.3 79 8 17
## [110,] 23.000000 115.00000 7.4 76 8 18
## [111,] 31.000000 244.00000 10.9 78 8 19
## [112,] 44.000000 190.00000 10.3 78 8 20
## [113,] 21.000000 259.00000 15.5 77 8 21
## [114,] 9.000000 36.00000 14.3 72 8 22
## [115,] 23.000000 255.00000 12.6 75 8 23
## [116,] 45.000000 212.00000 9.7 79 8 24
## [117,] 168.000000 238.00000 3.4 81 8 25
## [118,] 73.000000 215.00000 8.0 86 8 26
## [119,] 75.600000 153.00000 5.7 88 8 27
## [120,] 76.000000 203.00000 9.7 97 8 28
## [121,] 118.000000 225.00000 2.3 94 8 29
## [122,] 84.000000 237.00000 6.3 96 8 30
## [123,] 85.000000 188.00000 6.3 94 8 31
## [124,] 96.000000 167.00000 6.9 91 9 1
## [125,] 78.000000 197.00000 5.1 92 9 2
## [126,] 73.000000 183.00000 2.8 93 9 3
## [127,] 91.000000 189.00000 4.6 93 9 4
## [128,] 47.000000 95.00000 7.4 87 9 5
## [129,] 32.000000 92.00000 15.5 84 9 6
## [130,] 20.000000 252.00000 10.9 80 9 7
## [131,] 23.000000 220.00000 10.3 78 9 8
## [132,] 21.000000 230.00000 10.9 75 9 9
## [133,] 24.000000 259.00000 9.7 73 9 10
## [134,] 44.000000 236.00000 14.9 81 9 11
## [135,] 21.000000 259.00000 15.5 76 9 12
## [136,] 28.000000 238.00000 6.3 77 9 13
## [137,] 9.000000 24.00000 10.9 71 9 14
## [138,] 13.000000 112.00000 11.5 71 9 15
## [139,] 46.000000 237.00000 6.9 78 9 16
## [140,] 18.000000 224.00000 13.8 67 9 17
## [141,] 13.000000 27.00000 10.3 76 9 18
## [142,] 24.000000 238.00000 10.3 68 9 19
## [143,] 16.000000 201.00000 8.0 82 9 20
## [144,] 13.000000 238.00000 12.6 64 9 21
## [145,] 23.000000 14.00000 9.2 71 9 22
## [146,] 36.000000 139.00000 10.3 81 9 23
## [147,] 7.000000 49.00000 10.3 69 9 24
## [148,] 14.000000 20.00000 16.6 63 9 25
## [149,] 30.000000 193.00000 6.9 70 9 26
## [150,] 17.052632 145.00000 13.2 77 9 27
## [151,] 14.000000 191.00000 14.3 75 9 28
## [152,] 18.000000 131.00000 8.0 76 9 29
## [153,] 20.000000 223.00000 11.5 68 9 30
En este caso, se muestra en azul todas las instancias que antes existian y en rojo las nuevas, si vemos que sigue una distribución similar y que no se salen, la imputación habra tenido un buen resultado.