import numpy as np
import matplotlib.pyplot as plt
from scipy.special import binom, erf, erfinv
import scipy.stats
# Data: number of plots (parcelles) observed in each 10 q/ha yield class.
n_parcelles = np.array([5, 6, 40, 168, 288, 277, 165, 49, 2])
rendements = np.arange(0, 100, 10)  # class boundaries: 0, 10, ..., 90
N = n_parcelles.sum()               # total number of plots
# N == 1000

# Class midpoints, then empirical mean and standard deviation of the yield
# (grouped-data formulas: weighted first and second moments).
rendements_moy = 0.5 * (rendements[:-1] + rendements[1:])
mean = (n_parcelles * rendements_moy).sum() / N
# reuse rendements_moy instead of recomputing the midpoints inline
var = (n_parcelles * rendements_moy**2).sum() / N - mean**2
s = np.sqrt(var)
# (mean, s) == (49.76, 12.827408155976025)
# Display of the densities: observed counts next to the fitted normal density.
# The pdf is rescaled by N * bin_width (= 1000 * 10) to live on the count scale.
plt.bar(rendements_moy, n_parcelles, label='obs', alpha=0.5, width=9)
plt.bar(rendements_moy, scipy.stats.norm.pdf(rendements_moy, loc=mean, scale=s) * N * 10,
        label='gauss', alpha=0.5, width=9)
plt.legend();
# Bin probabilities under the fitted normal law: P(class i) = F(b_{i+1}) - F(b_i),
# with open tails (-inf, 10] for the first class and (80, +inf) for the last,
# so the nine probabilities sum to 1 exactly.
bornes = np.concatenate([[-np.inf], rendements[1:-1], [np.inf]])
ps = np.diff(scipy.stats.norm.cdf(bornes, loc=mean, scale=s)).tolist()
# Probabilities of each yield class under the normality hypothesis.
plt.bar(rendements_moy, ps, width=9)
plt.title('probabilites avec hyp. de loi normale')
# sanity check: sum(ps) == 1.0
# Expected (theoretical) counts under the normal law.
expected = np.array(ps) * N
# expected ~ [0.97, 9.20, 51.56, 161.64, 284.10, 280.19, 155.05, 48.10, 9.20]

# Merge the first two classes so that every expected count is at least 5
# (standard validity condition for the chi-squared goodness-of-fit test).
observed = np.concatenate([[n_parcelles[0] + n_parcelles[1]], n_parcelles[2:]])
expected = np.concatenate([[expected[0] + expected[1]], expected[2:]])
with np.printoptions(precision=1):
    print(observed, expected)
# -> [ 11  40 168 288 277 165  49   2] [ 10.2  51.6 161.6 284.1 280.2 155.1  48.1   9.2]
# Chi-squared goodness-of-fit statistic.
Z = ((observed - expected)**2 / expected).sum()
# 8 classes and 2 estimated parameters (mean, sigma): df = 8 - 1 - 2 = 5
deg_liberte = 8 - 1 - 2
# p-value from the chi2 survival function; ~9.8% so normality is not rejected at 5%.
print(f'p_valeur: {(1 - scipy.stats.chi2.cdf(Z, deg_liberte))*100:.1f} %')
# Cross-check with scipy.stats.chisquare; ddof=2 accounts for the 2 estimated parameters.
print(f'p_valeur: {scipy.stats.chisquare(observed, expected, ddof=2)[1]*100:.1f} %')
# both -> 'p_valeur: 9.8 %'
# Independence test: film category vs. snack purchase (2x4 contingency table).
categorie_film = ['action', 'comédie', 'familial', 'horreur']
avec_snack = [50, 125, 90, 45]
sans_snack = [65, 175, 30, 10]
contingence = [avec_snack, sans_snack]
# p-value ~ 3.26e-13: independence is very strongly rejected.
print(scipy.stats.chi2_contingency(contingence)[1])
# Small-sample 2x2 table: effect of Yates' continuity correction on the
# chi-squared independence test (drug vs. placebo survival).
survecu_medicament = 17
survecu_placebo = 8
pas_survecu_medicament = 29
pas_survecu_placebo = 38
contingence = [[survecu_medicament, survecu_placebo],
               [pas_survecu_medicament, pas_survecu_placebo]]
# With the correction the test is no longer significant at 5%:
# (p_corr, p_nocorr) ~ (0.0608, 0.0349)
print(scipy.stats.chi2_contingency(contingence, correction=True)[1],
      scipy.stats.chi2_contingency(contingence, correction=False)[1])
# How to draw independent discrete variables and count the occurrences
# in a contingency table.
n = 1000      # number of samples per test
alpha = 0.05  # significance level
px = np.array([0.2, 0.8])    # marginal law of X  (e.g. np.random.rand(2) normalized)
py = np.array([0.35, 0.65])  # marginal law of Y
# Joint law under independence: p(x, y) = px(x) * py(y).
p_xy_indep = np.outer(px, py)
pi = p_xy_indep.reshape(-1)
# One multinomial draw of the 2x2 table (example output: [[67, 140], [265, 528]]).
contingence = np.random.multinomial(n, pi).reshape((2, 2))
# Repeat the experiment N times and count the rejections. Since the data
# really are independent, every rejection is a false positive (type I error),
# expected at rate alpha; we compare the rates with and without Yates' correction.
N = 100000  # number of independence tests
n_reject_nocorr = 0
n_reject_corr = 0
for i in range(N):
    contingence = np.random.multinomial(n, pi).reshape((2, 2))
    p_value_nocorr = scipy.stats.chi2_contingency(contingence, correction=False)[1]
    p_value_corr = scipy.stats.chi2_contingency(contingence, correction=True)[1]
    n_reject_nocorr += p_value_nocorr < alpha
    n_reject_corr += p_value_corr < alpha
print(f'N = {N} experiments with {n} samples, alpha={100*alpha}%, %reject_corr {100*n_reject_corr/N:.2f}, %reject_nocorr {100*n_reject_nocorr/N:.2f}')
# -> %reject_corr ~ 4.13, %reject_nocorr ~ 5.04
# On constate que la correction de Yates est trop conservative : elle produit
# moins de faux positifs (rejets a tort de l'hypothese d'independance, ~4.1%)
# que le taux nominal attendu d'environ 5%.