Prédiction linéaire

# Load the data used to fit the model. read.csv2() expects a semicolon
# separator and a decimal comma; the IDENT column supplies the row names.
# Then display the structure of the resulting data frame.
HDV <- read.csv2(file = "HDV_MLIN.csv", row.names = "IDENT")
str(object = HDV)

## 'data.frame':    4000 obs. of  7 variables:
##  $ LIRE     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ ORDI     : int  0 0 0 0 0 1 1 1 0 0 ...
##  $ SPORT    : int  1 1 1 0 0 1 0 1 0 1 ...
##  $ GENRE    : int  2 1 1 1 1 1 1 2 2 2 ...
##  $ AGE      : int  60 50 82 58 53 39 22 37 26 25 ...
##  $ NIV_ETUDE: int  40 40 15 0 17 0 0 40 15 40 ...
##  $ HeuresTV : int  2 30 1 3 3 3 0 1 1 0 ...

# Logistic regression: model the binary outcome ORDI from every other column
# of HDV (binomial family with its default link), then print the fit summary.
res3 <- glm(formula = ORDI ~ ., family = binomial(), data = HDV)
summary(object = res3)

## 
## Call:
## glm(formula = ORDI ~ ., family = binomial(), data = HDV)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3200  -0.7886  -0.3783   0.8579   2.9900  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.650752   0.205343   8.039 9.06e-16 ***
## LIRE         0.834129   0.087557   9.527  < 2e-16 ***
## SPORT        0.533461   0.081861   6.517 7.19e-11 ***
## GENRE       -0.609796   0.080021  -7.620 2.53e-14 ***
## AGE         -0.064127   0.002890 -22.191  < 2e-16 ***
## NIV_ETUDE    0.037375   0.003187  11.726  < 2e-16 ***
## HeuresTV     0.051683   0.020913   2.471   0.0135 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5233.1  on 3999  degrees of freedom
## Residual deviance: 3962.7  on 3993  degrees of freedom
## AIC: 3976.7
## 
## Number of Fisher Scoring iterations: 5

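# Tous les coefficients positifs font augmenter la probabilité que ORDI = 1 (autrement dit, un coefficient doit être positif pour que la variable augmente cette probabilité ; GENRE et AGE, de coefficient négatif, la font diminuer).
# Les variables déterminantes sont : LIRE, SPORT, GENRE, AGE, NIV_ETUDE, HeuresTV (toutes significatives au seuil de 5 %).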

# Confusion matrix on the data used to fit the model: observed ORDI (rows)
# against the predicted class (columns), where the predicted class is the
# fitted probability rounded to 0 or 1.
table(
  HDV[["ORDI"]],
  round(predict(res3, newdata = HDV, type = "response"))
)

##    
##        0    1
##   0 2124  431
##   1  606  839

Taux d'erreur sur les données d'apprentissage

# Misclassification rate on HDV: proportion of rows whose predicted class
# (fitted probability rounded to 0 or 1, exactly as in the confusion matrix)
# differs from the observed ORDI.
# Computed from the model rather than from counts retyped by hand out of the
# printed confusion matrix, so it stays correct if the data or model changes.
# Rows with a missing prediction or outcome are ignored, which matches
# table()'s default of dropping NA values.
taux <- mean(
  round(predict(res3, newdata = HDV, type = "response")) != HDV$ORDI,
  na.rm = TRUE
)
taux

## [1] 0.25925

# Load the test data set; as for HDV, row names come from the IDENT column.
HDVTest <- read.csv2(file = "hdv_test.csv", row.names = "IDENT")

Matrice de confusion sur les données de test

# Confusion matrix on the test data: observed ORDI (rows) against the class
# predicted by res3 (columns), i.e. the predicted probability rounded to 0/1.
table(
  HDVTest[["ORDI"]],
  round(predict(res3, newdata = HDVTest, type = "response"))
)

##    
##        0    1
##   0 1962  367
##   1  470  680

# Announce the error-rate computation in the output.
print("calcul du taux")

## [1] "calcul du taux"

# Misclassification rate on HDVTest: proportion of rows whose predicted class
# (predicted probability rounded to 0 or 1, exactly as in the confusion
# matrix) differs from the observed ORDI.
# Computed from the model rather than from counts retyped by hand out of the
# printed confusion matrix, so it stays correct if the data or model changes.
# Rows with a missing prediction or outcome are ignored, which matches
# table()'s default of dropping NA values.
taux1 <- mean(
  round(predict(res3, newdata = HDVTest, type = "response")) != HDVTest$ORDI,
  na.rm = TRUE
)
taux1

## [1] 0.2405864

Data frame de nouvelles observations à prédire

# Two new observations to score with the model. They are identical on every
# predictor except GENRE (1 for the first row, 2 for the second).
# Columns are named at construction time instead of being renamed afterwards.
donnee <- data.frame(
  LIRE = c(0, 0),
  SPORT = c(0, 0),
  GENRE = c(1, 2),
  AGE = c(30, 30),
  NIV_ETUDE = c(30, 30),
  HeuresTV = c(0, 0)
)
donnee

  # Predicted probability that ORDI = 1 for each row of donnee.
  predict(object = res3, newdata = donnee, type = "response")

##         1         2 
## 0.5593205 0.4082051