🍋 ⛓️ 🙅🏾 Mergulhando nas estatísticas com Python. Parte 2. Distribuição do aluno 🍉 🔮 📟

Bom dia, habraledi e habragentelmen! Neste artigo, continuaremos nosso mergulho em estatísticas com Python. Se alguém perdeu o início do mergulho, aqui está um link para a primeira parte . Bem, se não, ainda recomendo manter o livro aberto de Sarah Boslaf, Statistics for All, por perto. Também recomendo executar o bloco de notas para experimentar códigos e gráficos.

Como disse Andrew Lang, “as estatísticas são para um político como um poste de luz para um vagabundo bêbado: um adereço em vez de uma iluminação. ” O mesmo pode ser dito para este artigo para iniciantes. É improvável que você aprenda muitos novos conhecimentos aqui, mas espero que este artigo o ajude a entender como usar Python para facilitar o autoestudo de estatística.

Por que inventar novas alocações?

Imagine ... então, antes de imaginar qualquer coisa, vamos fazer todas as importações necessárias novamente:

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
sns.set()
rcParams['figure.figsize'] = 10, 6
%config InlineBackend.figure_format = 'svg'
np.random.seed(42)

, , . , , - , . 1000 100- , . :

gen_pop = np.trunc(stats.norm.rvs(loc=80, scale=5, size=1000))
gen_pop[gen_pop>100]=100
print(f'mean = {gen_pop.mean():.3}')
print(f'std = {gen_pop.std():.3}')

mean = 79.5
std = 4.95

, , . 80 5 . , , , , , - .

, . , - . , - , ? - . , 10 , :

Z- :

$Z = \ frac {\ bar {x} - \ mu} {\ frac {\ sigma} {\ sqrt {n}}}$

$\ bar {x}$ - , $\ mu$ $\ sigma$ , - . :

sample = np.array([89,99,93,84,79,61,82,81,87,82])
sample.mean()

83.7

Z-:

z = 10**0.5*(sample.mean()-80)/5
z

2.340085468524603

p-value:

1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))

0.019279327322753836

, , : Z- 0 2 , .. 10 , , , 0.02. , 10 , "" $N (80, 5 ^ {2})$ , , 10 "" 83.7 2%. , , , , . .

- 10 , , :

sample.std(ddof=1)

10.055954565441423

ddof std

$\ sigma$ , , , . :

$\ begin {align *} & \ sigma = {\ sqrt {{\ frac {1} {n}} \ sum _ {i = 1} ^ {n} \ left (x_ {i} - {\ mu} \ right ) ^ {2}}} \\ & \\ & s = {\ sqrt {{\ frac {1} {n - 1}} \ sum _ {i = 1} ^ {n} \ left (x_ {i} - {\ bar {x}} \ right) ^ {2}}} \ end {align *}$

, , $\ sigma$ . - $\ mu$ , n - 1 . n - 1 ? , $\ mu$ $\ bar {x}$ - , $\ sigma$ . n - 1 , - .

, std() NumPy ddof, 0, std() , , ddof=1. . , 10000 10 $N (80, 5 ^ {2})$ , , ddof=0 . ddof=1 , - , ddof=0:

fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (12, 5))

for i in [0, 1]:
    deviations = np.std(stats.norm.rvs(80, 5, (10000, 10)), axis=1, ddof=i)
    sns.histplot(x=deviations ,stat='probability', ax=ax[i])
    ax[i].vlines(5, 0, 0.06, color='r', lw=2)
    ax[i].set_title('ddof = ' + str(i), fontsize = 15)
    ax[i].set_ylim(0, 0.07)
    ax[i].set_xlim(0, 11)
fig.suptitle(r'$s={\sqrt {{\frac {1}{10 - \mathrm{ddof}}}\sum _{i=1}^{10}\left(x_{i}-{\bar {x}}\right)^{2}}}$',
             fontsize = 20, y=1.15);

, Z-? , - , . 5000 10 , N (80, 5) :

deviations  = np.std(stats.norm.rvs(80, 5, (5000, 10)), axis=1, ddof=1)
sns.histplot(x=deviations ,stat='probability');

, , 10- . . , , 10 2%, , ( ) 10 0. , , : 10- , .

, , , : - , - , . , "" $N (80, 10 ^ {2})$ , Z- p-value 10- :

z = 10**0.5*(sample.mean()-80)/10
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'z = {z:.3}')
print(f'p-value = {p:.4}')

z = 1.17
p-value = 0.242

, $N (80, 5 ^ {2})$ , , , .. , $N (80, 10 ^ {2})$ , . 2%, 25%. , - $\ sigma$ .

, ? ! , : (, , - )

$Z = \ frac {\ bar {x} - \ mu} {\ frac {\ sigma} {\ sqrt {n}}} \; ; \; \; \; T = \ frac {\ bar {x} - \ mu} {\ frac {s} {\ sqrt {n}}}.$

T-, Z- , $\ sigma$ , . 10000 $N (80, {5} ^ 2)$ , Z- T-, :

fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (12, 5))

N = 10000
samples = stats.norm.rvs(80, 5, (N, 10))
statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
              lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]
title = 'ZT'
bins = np.linspace(-6, 6, 80, endpoint=True)

for i in range(2):
    values = statistics[i](samples)
    sns.histplot(x=values ,stat='probability', bins=bins, ax=ax[i])
    p = values[(values > -2)&(values < 2)].size/N
    ax[i].set_title('P(-2 < {} < 2) = {:.3}'.format(title[i], p))
    ax[i].set_xlim(-6, 6)
    ax[i].vlines([-2, 2], 0, 0.06, color='r');

- :

import matplotlib.animation as animation

fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (18, 8))

def animate(i):
    for ax in axes:
        ax.clear()
    N = 10000
    samples = stats.norm.rvs(80, 5, (N, 10))
    statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
                  lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]
    title = 'ZT'
    bins = np.linspace(-6, 6, 80, endpoint=True)

    for j in range(2):
        values = statistics[j](samples)
        sns.histplot(x=values ,stat='probability', bins=bins, ax=axes[j])
        p = values[(values > -2)&(values < 2)].size/N
        axes[j].set_title(r'$P(-2\sigma < {} < 2\sigma) = {:.3}$'.format(title[j], p))
        axes[j].set_xlim(-6, 6)
        axes[j].set_ylim(0, 0.07)
        axes[j].vlines([-2, 2], 0, 0.06, color='r')
    return axes

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(7),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('statistics_dist.gif',
                 writer='imagemagick', 
                 fps=1)

, , . ? -, , . , , ? , , N (0, 1) $[-2 \ sigma; 2 \ sigma]$ 95.5% . Z- , T- , 92-93% . , , - , :

statistics = [lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/5,
              lambda x: 10**0.5*(np.mean(x, axis=1) - 80)/np.std(x, axis=1, ddof=1)]

quantity = 50
N=10000
result = []
for i in range(quantity):
    samples = stats.norm.rvs(80, 5, (N, 10))
    Z = statistics[0](samples)
    p_z = Z[(Z > -2)&((Z < 2))].size/N
    T = statistics[1](samples)
    p_t = T[(T > -2)&((T < 2))].size/N
    result.append([p_z, p_t])

result = np.array(result)
fig, ax = plt.subplots()

line1, line2 = ax.plot(np.arange(quantity), result)
ax.legend([line1, line2], 
          [r'$P(-2\sigma < {} < 2\sigma)$'.format(i) for i in 'ZT'])
ax.hlines(result.mean(axis=0), 0, 50, color='0.6');

50 . , , , , . ? , ! Z- T- , . , T- ? , - . , , - , , , . , . , - , , $\ sigma$ .

Z-, $\ bar {x}$ , - . 10000 N (80, 5) 10 , :

#     ,
#    svg  png:
#%config InlineBackend.figure_format = 'png'

N = 10000
samples = stats.norm.rvs(80, 5, (N, 10))

means = samples.mean(axis=1)
deviations = samples.std(ddof=1, axis=1)
T = statistics[1](samples)
P = (T > -2)&((T < 2))

fig, ax = plt.subplots()

ax.scatter(means[P], deviations[P], c='b', alpha=0.7,
           label=r'$\left | T \right | < 2\sigma$')
ax.scatter(means[~P], deviations[~P], c='r', alpha=0.7,
           label=r'$\left | T \right | > 2\sigma$')

mean_x = np.linspace(75, 85, 300)
s = np.abs(10**0.5*(mean_x - 80)/2)
ax.plot(mean_x, s, color='k',
        label=r'$\frac{\sqrt{n}(\bar{x}-\mu)}{2}$')
ax.legend(loc = 'upper right', fontsize = 15)
ax.set_title('   \n  ',
             fontsize=15)
ax.set_xlabel(r'   ($\bar{x}$)',
              fontsize=15)
ax.set_ylabel(r'   ($s$)',
              fontsize=15);

, . , , $\ bar {x}$ , .. . , , $N (\ mu, \ sigma ^ {2})$ , $\ left | \ bar {x} - \ mu \ right |$ $\ left | s - \ sigma \ right |$ . , ( ) , :

$\ left | \ bar {x} - \ mu \ right | > \ frac {2 \ sigma s} {\ sqrt {n}}$

, $\ sigma = 1$ , .. , , , $\ mu = 80$ , n = 10 :

$\ left | \ bar {x} - 80 \ right | > \ frac {2s} {\ sqrt {10}}$

, $[-2 \ sigma; 2 \ sigma]$ , , 92,5% .

? , . , ( ) 100- . , , ( ). 10- 82- , 2- . , , . $\ mu = 80$ , , .. $\ sigma = s = 2$ ? Z-:

$Z = \frac{\bar{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{82 - 80}{\frac{2}{\sqrt{10}}} \approx 3.16$

$N(80, 2^{2})$ p-value:

z = 10**0.5*(82-80)/2
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'p-value = {p:.2}')

p-value = 0.0016

10 82- 2%. $N(80, 2^{2})$ . , $\sigma = s = 2$ , , , .

, , , . ( $\left | \bar{x} - \mu \right |$ ) ( ).

10 . 82 , , , 9- . ? :

z = 10**0.5*(82-80)/9
p = 1 - (stats.norm.cdf(z) - stats.norm.cdf(-z))
print(f'p-value = {p:.2}')

p-value = 0.48

10 $\bar{x} = 82$ s = 9 $N(80, 9^{2})$ . , , - .

, , . :

import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 10000
    
    samples = stats.norm.rvs(80, 5, (N, i))

    means = samples.mean(axis=1)
    deviations = samples.std(ddof=1, axis=1)
    T = i**0.5*(np.mean(samples, axis=1) - 80)/np.std(samples, axis=1, ddof=1)
    P = (T > -2)&((T < 2))
    
    prob = T[P].size/N
    ax.set_title(r' $s$  $\bar{x}$  $n = $' + r'${}$'.format(i),
                 fontsize = 20)
    ax.scatter(means[P], deviations[P], c='b', alpha=0.7,
               label=r'$\left | T \right | < 2\sigma$')
    ax.scatter(means[~P], deviations[~P], c='r', alpha=0.7,
               label=r'$\left | T \right | > 2\sigma$')

    mean_x = np.linspace(75, 85, 300)
    s = np.abs(i**0.5*(mean_x - 80)/2)
    ax.plot(mean_x, s, color='k',
            label=r'$\frac{\sqrt{n}(\bar{x}-\mu)}{2}$')
    ax.legend(loc = 'upper right', fontsize = 15)
    ax.set_xlim(70, 90)
    ax.set_ylim(0, 10)
    ax.set_xlabel(r'   ($\bar{x}$)',
              fontsize='20')
    ax.set_ylabel(r'   ($s$)',
              fontsize='20')
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(5, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('sigma_rel.gif',
                 writer='imagemagick', 
                 fps=3)

, $\bar{x}$ $\mu$ $\sigma$ $N(\mu, \sigma^{2})$ , . Z-, .

! , ? - , , . , , 10- :

$\bar{x} = 83.7$ , s = 10.06 , , , , , $N(80, 5^{2})$ . Z-, T-, Z- , $\sigma$ . , - $N(80, 5^{2})$ , $N(80, 10.06^{2})$ - ? ?: , $N(80, 1^{2})$ , $N(80, 5^{2})$ , $N(80, 7^{2})$ $\sigma$ ?

, . , - . , $N(80, 5^{2})$ , , , 10 , 10 . , $N(80, 5^{2})$ . , - , .

: $\bar{x} = 83.7$ , s = 10.06 , $N(80, \sigma^{2})$ $\sigma$ . , , $83<\bar{x}<84$ 9.5<s<10.5 :

N = 10000
sigma = np.linspace(5, 20, 151)
prob = []

for i in sigma:
    p = []
    for j in range(10):
        samples = stats.norm.rvs(80, i, (N, 10))
        means = samples.mean(axis=1)
        deviations = samples.std(ddof=1, axis=1)
        p_m = means[(means >= 83) & (means <= 84)].size/N
        p_d = deviations[(deviations >= 9.5) & (deviations <= 10.5)].size/N
        p.append(p_m*p_d)
    prob.append(sum(p)/len(p))
prob = np.array(prob)

fig, ax = plt.subplots()
ax.plot(sigma, prob)
ax.set_xlabel(r'    ($\sigma$)',
              fontsize=20)
ax.set_ylabel('',
              fontsize=20);

, $\sigma \approx 10$ . , , $\sigma$ , . - , , , , . .

T-?

, - - . 1% , - . , , . , - -. ?

- ! , - , "" t-. , , , . , , 1943 , 50% . , - .

, "" . , ( !) , "" , :

$t = \frac{\bar{x}-\mu}{\frac{s}{\sqrt{n}}}$

t-, . , , "", , , , , - . " ", "t-" , .

$t={\frac {Y_{0}}{{\sqrt {{\frac {1}{n}}\sum \limits _{{i=1}}^{n}Y_{i}^{2}}}}},$

" " . $Y_{i}$ , .. $Y_{i}\sim {\mathcal {N}}(0,1),;i=0,\ldots ,n$ , , .. , . - , :

$t\sim {\mathrm {t}}(n)$

, :

import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 15000
    
    x = np.linspace(-5, 5, 100)
    ax.plot(x, stats.norm.pdf(x, 0, 1), color='r')
    
    samples = stats.norm.rvs(0, 1, (N, i))
    
    t = samples[:, 0]/np.sqrt(np.mean(samples[:, 1:]**2, axis=1))
    t = t[(t>-5)&(t<5)]
    sns.histplot(x=t, bins=np.linspace(-5, 5, 100), stat='density', ax=ax)
    
    ax.set_title(r'  $t(n)$  n = ' + str(i), fontsize = 20)
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 0.5)
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(2, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('t_rel_of_df.gif',
                 writer='imagemagick', 
                 fps=3)

, , , , N(0, 1) . , , :

${\ displaystyle f_ {t} (y) = {\ frac {\ Gamma \ left ({\ frac {n + 1} {2}} \ right)} {{\ sqrt {n \ pi}} \, \ Gamma \ left ({\ frac {n} {2}} \ right)}} \, \ left (1 + {\ frac {y ^ {2}} {n}} \ right) ^ {- {\ frac {n +1} {2}}}}$

SciPy:

import matplotlib.animation as animation

fig, ax = plt.subplots(figsize = (15, 9))

def animate(i):
    ax.clear()
    N = 15000
    
    x = np.linspace(-5, 5, 100)
    ax.plot(x, stats.norm.pdf(x, 0, 1), color='r')
    ax.plot(x, stats.t.pdf(x, df=i))
    
    ax.set_title(r'  $t(n)$  n = ' + str(i), fontsize = 20)
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 0.45)
    return ax

dist_animation = animation.FuncAnimation(fig, 
                                      animate, 
                                      frames=np.arange(2, 21),
                                      interval = 200,
                                      repeat = False)

dist_animation.save('t_pdf_rel_of_df.gif',
                 writer='imagemagick', 
                 fps=3)

, ( df ) . - , , , .

t-

t- SciPy :

sample = np.array([89,99,93,84,79,61,82,81,87,82])

stats.ttest_1samp(sample, 80)

Ttest_1sampResult(statistic=1.163532240174695, pvalue=0.2745321678073461)

T = 9**0.5*(sample.mean() -80)/sample.std()
T

1.163532240174695

, n = 10 , , n - 1 . , 1 , , , . :

T = 10**0.5*(sample.mean() -80)/sample.std(ddof=1)
T

1.1635322401746953

, t- , p-value? , - , p-value Z-, t-:

t = stats.t(df=9)
fig, ax = plt.subplots()
x = np.linspace(t.ppf(0.001), t.ppf(0.999), 300)
ax.plot(x, t.pdf(x))
ax.hlines(0, x.min(), x.max(), lw=1, color='k')
ax.vlines([-T, T], 0, 0.4, color='g', lw=2)
x_le_T, x_ge_T = x[x<-T], x[x>T]
ax.fill_between(x_le_T, t.pdf(x_le_T), np.zeros(len(x_le_T)), alpha=0.3, color='b')
ax.fill_between(x_ge_T, t.pdf(x_ge_T), np.zeros(len(x_ge_T)), alpha=0.3, color='b')

p = 1 - (t.cdf(T) - t.cdf(-T))
ax.set_title(r'$P(\left | T \right | \geqslant {:.3}) = {:.3}$'.format(T, p));

, p-value 27%, .. , - , $\ alpha = 0,05$ , p-value , 5 . , , - , $\ alpha$ , 0.95:

$\ textrm {CI} _ {1 - \ alpha} = \ bar {x} \ pm \ left (t _ {\ frac {\ alpha} {2}, \ textrm {df}} \ right) \ left (\ frac {s} {\ sqrt {n}} \ right)$

SciPy, interval loc () scale () :

sample_loc = sample.mean()
sample_scale = sample.std(ddof=1)/10**0.5
ci = stats.t.interval(0.95, df=9, loc=sample_loc, scale=sample_scale)
ci

(76.50640345566619, 90.89359654433382)

, $\ bar {x} = 83,7$ , , $\ alpha = 0,95$ [76,5; 90,9] . , , $\ bar {x}$ , .

, , , ( ). , , t- , t- , t- .

Claro, gostaria de inserir algum gif no final, mas quero encerrar com a frase de Herbert Spencer: “ O maior objetivo da educação não é o conhecimento, mas a ação ”, então lançem suas sucuris e entrem em ação ! Isso é especialmente verdadeiro para pessoas autodidatas como eu.

Eu pressiono F5 e aguardo seus comentários!

Mergulhando nas estatísticas com Python. Parte 2. Distribuição do aluno

Por que inventar novas alocações?

T-?

t-

More articles: