# General Statistics - Session 1
#
# Teaching script: simulates normal (and chi-squared) samples and illustrates
# the median, box plots, histograms, bar/pie charts, the normal density,
# the 1.96-standard-deviation rule, centering, scaling, and the behaviour of
# sample means. Intended to be run top to bottom (or section by section) in an
# interactive session; bare expressions such as `summary(a)` rely on
# top-level auto-printing.

# Example ----
n <- 20      # sample size of the small samples
tour <- 85   # mean passed to rnorm() throughout the script
sigma <- 2   # standard deviation passed to rnorm() throughout the script

# Number of values of `x` lying outside mean(x) +/- z * sd(x).
#
# x: numeric vector (the sample).
# z: half-width of the interval in standard deviations (default 1.96).
# Returns a single integer count.
#
# NOTE(review): the original expression had been corrupted to
# `sum(amean(a)+1.96*sd(a))` (the text between `<` and `>` was lost); it is
# reconstructed here from the surrounding comments - confirm against the
# course material.
count_outside <- function(x, z = 1.96) {
  centre <- mean(x)
  half_width <- z * sd(x)
  sum(x < centre - half_width | x > centre + half_width)
}

# Take 20 people at random ----
a <- rnorm(n, tour, sigma)
b <- rep(0, n)
plot(a, b, cex = 0.5, col = "dark red")

# Summarise the data set
summary(a)
mean(a)

# Sort the observations
sort(a)

# Notion of median: with an even sample size (20) it is the midpoint of the
# 10th and 11th sorted observations.
mediane <- (sort(a)[10] + sort(a)[11]) / 2
abline(v = mediane, col = "red")

# Take 21 people at random ----
a <- rnorm(21, tour, sigma)
b <- rep(0, 21)
plot(a, b, cex = 0.5, col = "dark red")
summary(a)
mean(a)
sort(a)
# With an odd sample size (21) the median is the 11th sorted observation.
mediane <- sort(a)[11]
abline(v = mediane, col = "red")

# Box plot ----
par(mfrow = c(2, 1))
a <- rnorm(21, tour, sigma)
b <- rep(0, 21)
mediane <- sort(a)[11]
plot(a, b, cex = 0.5, col = "dark red")
abline(v = mediane, col = "red")
boxplot(a, horizontal = TRUE)
abline(v = mediane, col = "red")

# Histogram, n = 200 ----
par(mfrow = c(2, 1))
a <- rnorm(200, tour, sigma)
b <- rep(0, 200)
plot(a, b, cex = 0.5, col = "dark red")
hist(a)

# Bar chart ----
library(FactoMineR)
library(Rcmdr)
library(colorspace, pos = 4)
data(tea)
barplot(table(tea$How), xlab = "How", ylab = "Frequency")

# Pie chart ----
pie(
  table(tea$How),
  labels = levels(tea$How),
  main = "How",
  col = rainbow_hcl(length(levels(tea$How)))
)

# Normal distribution ----
# Standard normal density with the +/- 1.96 bounds marked.
x <- seq(-4, 4, length.out = 200)
y <- dnorm(x)
plot(x, y, type = "l", lwd = 2, col = "blue")
abline(v = 1.96, col = "grey")
abline(v = -1.96, col = "grey")

# Same density, with the area between -1.96 and 1.96 shaded.
x <- seq(-4, 4, length.out = 200)
y <- dnorm(x)
plot(x, y, type = "l", lwd = 2, col = "blue")
x <- seq(-1.96, 1.96, length.out = 200)
y <- dnorm(x)
# Close the polygon on the x axis at the actual bounds of the shaded segment
# (the original used -2 and 2, which slanted the edges past +/- 1.96).
polygon(c(x[1], x, x[length(x)]), c(0, y, 0), col = "gray")

# Proportion of realizations between -1.96 and 1.96
pnorm(1.96, mean = 0, sd = 1) - pnorm(-1.96, mean = 0, sd = 1)

# Notion of standard deviation ----
# Out of 200 realizations, the number falling below mean - 1.96 sd or above
# mean + 1.96 sd. Re-run as often as you like.
a <- rnorm(200, tour, sigma)
count_outside(a)

# Now look at what happens over 20 samples.
# Create a vector to store the number of realizations outside the bounds.
ecarttype <- vector(mode = "numeric", length = 20)
for (i in seq_along(ecarttype)) {
  a <- rnorm(200, tour, sigma)
  ecarttype[i] <- count_outside(a)
  # Explicit print(): expressions are not auto-printed inside a loop.
  print(ecarttype[i])
}

# Average of that number of realizations
mean(ecarttype)

# The same thing, written more compactly
ecarttype <- vapply(
  seq_len(20),
  function(i) count_outside(rnorm(200, tour, sigma)),
  integer(1)
)
mean(ecarttype)

# Centering ----
par(mfrow = c(2, 1))
a <- rnorm(n, tour, sigma)
b <- rep(0, n)
plot(a, b, cex = 0.5, col = "dark red")
abline(v = mean(a), col = "grey")
acentre <- a - mean(a)
plot(acentre, b, cex = 0.5, col = "dark red")
abline(v = 0, col = "grey")

# Scaling ----
par(mfrow = c(1, 1))
b <- rep(0, n)
a <- rnorm(n, tour, sigma)
acentre <- a - mean(a)
plot(acentre, b, cex = 0.5, col = "dark red")
abline(v = 0, col = "grey")
areduit <- (a - mean(a)) / sd(a)
# Draw the centred-and-scaled values slightly above the centred ones.
b <- rep(0.5, n)
points(areduit, b, cex = 0.5, col = "blue")

# The mean of the sample means tends towards the true mean ----
par(mfrow = c(1, 1))

# Run this several times
a <- rnorm(n, tour, sigma)
b <- rep(1, n)
plot(a, b, cex = 0.5, col = "dark red")
# The triangle at y = 0.8 marks the sample mean.
points(x = mean(a), y = 0.8, pch = 24, cex = 0.5, col = "dark red")
abline(v = tour, col = "grey")

# Now look at what happens over 11 samples.
# Create a vector to store the 11 sample means.
moyennes <- vector(mode = "numeric", length = 11)
# One colour per sample; the third sample uses the device default colour,
# as in the original (no `col` argument was given for it).
couleurs <- c(
  "dark red", "pink", par("col"), "green", "purple", "tomato",
  "plum", "gold", "dimgrey", "aquamarine", "firebrick1"
)
# Each sample is drawn on its own row: y = 1, 1.025, ..., 1.25.
hauteurs <- seq(1, 1.25, length.out = length(moyennes))
for (k in seq_along(moyennes)) {
  a <- rnorm(n, tour, sigma)
  b <- rep(hauteurs[k], n)
  moyennes[k] <- mean(a)
  if (k == 1) {
    # The first sample opens the plot; the others are added to it.
    plot(a, b, cex = 0.5, col = couleurs[k])
  } else {
    points(a, b, cex = 0.5, col = couleurs[k])
  }
  points(x = mean(a), y = 0.8, pch = 24, cex = 0.5, col = couleurs[k])
}
mean(moyennes)
abline(v = mean(moyennes), col = "red")

# By simulation: 100 samples of size 20
moyennes <- vapply(
  seq_len(100),
  function(i) mean(rnorm(n, tour, sigma)),
  numeric(1)
)
mean(moyennes)

# What if the parent distribution is not normal? ----
a <- rchisq(100, df = 5)
b <- rep(0, 100)
plot(a, b, cex = 0.5, col = "firebrick1")
hist(a)

# Chi-squared density with 5 degrees of freedom
.x <- seq(0.158, 22.105, length.out = 100)
plot(
  .x, dchisq(.x, df = 5),
  xlab = "x", ylab = "Density",
  main = "ChiSquared Distribution: Degrees of freedom=5",
  type = "l"
)
abline(h = 0, col = "gray")

# Simulate 150 samples (each of size 150) from a chi-squared distribution
# and look at the distribution of their means.
moyennes <- vapply(
  seq_len(150),
  function(i) mean(rchisq(150, df = 5)),
  numeric(1)
)
hist(moyennes)