Ling 201, session 03: descr. stats 1 (key)

Author
Affiliation

UC Santa Barbara & JLU Giessen

Published

21 Jan 2025 12-34-56

rm(list=ls(all=TRUE)); library(magrittr)

1 Exercise 01

Compute the mean of the following numbers: 1, 2, 4, 8, 10, 12.

mean(c(1, 2, 4, 8, 10, 12))
[1] 6.166667

2 Exercise 02

Based on their performance in a test, students were awarded the following grades (1=best, 6=worst):

grades.in.test<-rep(      # repeat=
   1:6,                   # the numbers from 1 to 6
   c(2, 15, 10, 4, 4, 2)) # this many times respectively
  1. Compute the most appropriate measure of central tendency.

Given that such grades are typically ordinal, you should use the median:

median(grades.in.test)
[1] 3
  1. Compute the most appropriate measure of dispersion.

Given that such grades are typically ordinal, you should use the interquartile range:

IQR(grades.in.test)
[1] 2

A nice alternative might be the median absolute deviation:

mad(grades.in.test)
[1] 1.4826
  1. Represent the result graphically in a bar plot.
op <- par(mar=c(4, 4, 2, 1)) # customize the plotting margins
(qwe <- barplot(           # make qwe the result of creating a bar plot
   table(grades.in.test))) # of the frequencies of grades.in.test
     [,1]
[1,]  0.7
[2,]  1.9
[3,]  3.1
[4,]  4.3
[5,]  5.5
[6,]  6.7
text(                       # plot text at
   qwe,                     # these x-axis coordinates: middles of 6 bars
   table(grades.in.test)/2, # these y-axis coordinates: half the bar heights
   table(grades.in.test))   # namely this text: the frequencies of grades.in.test / bar heights
text(   # plot text at
   4.9, # this x-axis coordinate
   12,  # this y-axis coordinate
   "Median=3\ninterquartile range=2") # namely this text

par(op)

3 Exercise 03

Load the file _input/disfluencies.csv into a data frame d; you can find information about this data set in _input/disfluencies.r.

summary(d <- read.delim(
   "_input/disfluencies.csv",
   stringsAsFactors=TRUE))
      CASE            SEX          FILLER        GENRE         LENGTH
 Min.   :   1.0   female:502   silence:332   dialog :547   Min.   : 251
 1st Qu.: 250.8   male  :498   uh     :394   monolog:453   1st Qu.: 583
 Median : 500.5                uhm    :274                 Median : 897
 Mean   : 500.5                                            Mean   : 915
 3rd Qu.: 750.2                                            3rd Qu.:1242
 Max.   :1000.0                                            Max.   :1600
    POSITION
 Min.   : 1.00
 1st Qu.: 4.00
 Median : 7.00
 Mean   : 6.57
 3rd Qu.:10.00
 Max.   :12.00  

4 Exercise 04

Determine how many fillers/disfluencies occurred in each genre.

Since each row is one data point for one filler/disfluency occurring in one genre, you just need to tabulate GENRE:

table(d$GENRE)

 dialog monolog
    547     453 

5 Exercise 05

Represent in a graph how many fillers/disfluencies occurred in each genre.

Here’s a dot chart:

op <- par(mar=c(4, 4, 2, 1)) # customize the plotting margins
dotchart(main="The frequency distribution of GENRE", # the main heading
   xlab="Observed frequency",      # x-axis label
   xlim=c(0, sum(table(d$GENRE))), # x-axis limits: c(min, max)
   x=table(d$GENRE),               # what's to be plotted
   ylab="Genre",                   # y-axis label
   pch=16)                         # point character: 16 = filled circle
abline(v=nrow(d)/2, lty=3) # a line representing a random distribution

par(op)

And here’s a bar plot:

op <- par(mar=c(4, 4, 2, 1)) # customize the plotting margins
qwe <- barplot(main="The frequency distribution of GENRE", # the main heading
   xlab="Genre", # x-axis label
   # y-axis label, y-axis limits (min, max), y-values to be plotted, here called "height"
   ylab="Observed frequency", ylim=c(0, sum(table(d$GENRE))), height=table(d$GENRE),
   col=c("lightgrey", "darkgrey")) # color of the bars, strictly speaking unnecessary!
text( # plot text:
   # x-axis coordinates where text will be plotted
   x=qwe,              # the horizontal middles of the bars
   # y-axis coordinates where text will be plotted: half the freqs, i.e.
   y=table(d$GENRE)/2, # the vertical  middles of the bars
   # the text to be plotted: the observed frequencies
   labels=table(d$GENRE))
abline(h=nrow(d)/2, lty=3) # a line representing a random distribution

par(op)

6 Exercise 06

Was the 990th filler/disfluency produced by a man or a woman? We can just look that up directly:

d$SEX[990]
[1] female
Levels: female male

7 Exercise 07

Generate separate summary statistics for the lengths in each of the two genres.

The not-so-great solution (but it is a solution) is to do things stepwise and manually:

summary(     # summarize
   d$LENGTH[ # the values of LENGTH, but only those
      d$GENRE=="dialog"])   # where GENRE is dialog
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  253.0   606.0   906.0   926.5  1263.5  1598.0 
# and the same for monolog:
summary(d$LENGTH[d$GENRE=="monolog"])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  251.0   567.0   889.0   901.2  1219.0  1600.0 

The better solution uses tapply:

tapply(      # apply to
   d$LENGTH, # the vector LENGTH
   d$GENRE,  # a grouping by GENRE
   summary)  # apply the function summary to each group
$dialog
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  253.0   606.0   906.0   926.5  1263.5  1598.0

$monolog
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  251.0   567.0   889.0   901.2  1219.0  1600.0 

Or, with argument names for a differently ordered description:

tapply(           # apply
   FUN=summary,   # the function summary
   X=d$LENGTH,    # to the vector LENGTH
   INDEX=d$GENRE) # grouped by GENRE
$dialog
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  253.0   606.0   906.0   926.5  1263.5  1598.0

$monolog
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  251.0   567.0   889.0   901.2  1219.0  1600.0 

8 Exercise 08

Determine how many fillers/disfluencies are longer than the average of all fillers/disfluencies.

One of the best solutions is this:

sum(               # sum up the TRUEs when you check
   d$LENGTH>       # whether each length is >
   mean(d$LENGTH)) # the mean of all lengths
[1] 481
# or, with the pipe:
(d$LENGTH>mean(d$LENGTH)) %>% sum
[1] 481
# or even more pipe-y:
d$LENGTH %>% ">"(mean(.)) %>% sum
[1] 481

Some less ideal solutions include the following:

table(                     # count
   d$LENGTH>mean(d$LENGTH) # whether LENGTH is > than average or not
)["TRUE"]                  # but only return the number of TRUEs
TRUE
 481 
table(sign(                # how many cases are there of the signs
   d$LENGTH-mean(d$LENGTH) # of each values difference to the mean
))["1"]                    # but only return the number of positive diffs
  1
481 
length(                  # how many cases are there where
   which(d$LENGTH>       # the value of LENGTH is >
         mean(d$LENGTH)) # the mean of LENGTH
) # end of length() counting positions
[1] 481

9 Exercise 09

Determine who produces more of the fillers/disfluencies that are longer than the average of all fillers/disfluencies, men or women.

The simplest and best solution that returns what is asked for and nothing else is this:

qwe <- table(d$SEX[d$LENGTH>mean(d$LENGTH)])
names(qwe[qwe==max(qwe)])
[1] "female"
# very pipe-y but actually maybe more difficult/convoluted:
qwe %>% "=="(max(.)) %>% qwe[.] %>% names
[1] "female"

10 Exercise 10

What does this do? (If you cannot see that immediately from the plot, execute the function without the barplot plotting part.)

barplot(              # generate a bar plot
   prop.table(        # of a table with proportions
      table(d$FILLER, # of the cross-tabulated frequencies of FILLER
            d$GENRE), # with GENRE
      2               # where the %s sum up to 1 (100%) by column (i.e., GENRE)
   ) # end of prop.table()
) # end of barplot()

Can you re-write this with the pipe?

d$FILLER %>% table(d$GENRE) %>% prop.table(2) %>% barplot

11 Exercise 11

Sort the data frame d

  • according to the factor SEX (ascending) and, within SEX,
  • according to the disfluencies/FILLER (descending) and, within the disfluencies/FILLER,
  • according to the lengths of the fillers/disfluencies (ascending).
order.index <- order(
         d$SEX,      # according to `SEX` (ascending)
   -rank(d$FILLER),  # according to `FILLER` (descending)
         d$LENGTH)   # according to the lengths of the fillers
d <- d[order.index,]

12 Exercise 12

Compute the proportions of the three fillers/disfluencies, their 95% confidence intervals, and discuss briefly what the confidence intervals suggest concerning the different frequencies of the fillers/disfluencies.

table(d$FILLER)             # frequency table of FILLER

silence      uh     uhm
    332     394     274 
prop.table(table(d$FILLER)) # proportion table of FILLER

silence      uh     uhm
  0.332   0.394   0.274 
binom.test(sum(d$FILLER=="silence"), # how many fillers are "silence"
          length(d$FILLER))$conf.int # out of all, just the CI
[1] 0.3028427 0.3621502
attr(,"conf.level")
[1] 0.95
binom.test(sum(d$FILLER=="uh"),      # how many fillers are "uh"
          length(d$FILLER))$conf.int # out of all, just the CI
[1] 0.3635648 0.4250612
attr(,"conf.level")
[1] 0.95
binom.test(sum(d$FILLER=="uhm"),     # how many fillers are "uhm"
          length(d$FILLER))$conf.int # out of all, just the CI
[1] 0.2465556 0.3027818
attr(,"conf.level")
[1] 0.95

(You could get rid of the attribute by computing the CIs as above and add %>% as.numeric to the code; try it.) As for interpretation: The confidence intervals do not overlap so the proportions are significantly different. Very important though: Whether CIs overlap or not is a diagnostic that works ‘only in one way’:

  • if two CIs do not overlap, that means something: you can infer that the proportions (or means) for which they were computed are indeed significantly different;
  • if two CIs do overlap, that means nothing with regard to whether the proportions (or means) are significantly different or not – you would still need to run a dedicated test.

There is a much better version (that you weren’t expected to know about because it uses lapply and an anonymous function; we might get to this later):

qwe <- lapply(         # make qwe the result of applying to each value of
   table(d$FILLER),    # the table of FILLER
   binom.test,         # the function binom.test
   n=length(d$FILLER)) # use length(FILLER) as the argument to binom.test that's called n (the 2nd)
lapply(qwe, # apply to each element of the list qwe
   # an anonymous function extracting the CI:
   # \(af) "[["(af, "conf.int") %>% as.numeric)
   \(af) as.numeric("[["(af, "conf.int")))
$silence
[1] 0.3028427 0.3621502

$uh
[1] 0.3635648 0.4250612

$uhm
[1] 0.2465556 0.3027818

And here’s a (simple percentile) bootstrapping version – for simplicity’s sake, I’m showing only a short version dealing with only one filler/disfluency (silence) and I’m not showing better bootstrapping approaches (that here produce virtually identical results anyway):

collector <- rep(NA, iterations <- 2000) # set up a collector vector
set.seed(123); for (i in seq(iterations)) { # do something iterations times, namely
   # take a sample of FILLER (random with replacement)
   FILLER.sampled <- sample(d$FILLER, length(d$FILLER), replace=TRUE)
   collector[i] <- sum(FILLER.sampled=="silence")/1000
}
quantile(collector, probs=c(0.025, 0.975)) # extract 'CI'
 2.5% 97.5%
0.304 0.362 

13 Exercise 13

Compute the average positions in sentences for the three fillers/disfluencies, their 95% confidence intervals, and discuss briefly what the confidence intervals suggest concerning the different average positions of the fillers/disfluencies.

tapply(        # apply to
   d$POSITION, # the vector of POSITION
   d$FILLER,   # a grouping by FILLER & then
   mean)       # make those grouped values the 1st argum. to the function mean
 silence       uh      uhm
6.665663 6.555838 6.474453 
round(t.test(   # round the result of a t-test on
   x=d$POSITION[d$FILLER=="silence"])$ # the values for whose mean we want the CI
   conf.int, 4) # return only the confidence interval, round to 1 decimal
[1] 6.3029 7.0285
attr(,"conf.level")
[1] 0.95
round(t.test(   # round the result of a t-test on
   x=d$POSITION[d$FILLER=="uh"])$ # the values for whose mean we want the CI
   conf.int, 4) # return only the confidence interval, round to 1 decimal
[1] 6.2078 6.9038
attr(,"conf.level")
[1] 0.95
round(t.test(   # round the result of a t-test on
   x=d$POSITION[d$FILLER=="uhm"])$ # the values for whose mean we want the CI
   conf.int, 4) # return only the confidence interval, round to 1 decimal
[1] 6.0601 6.8889
attr(,"conf.level")
[1] 0.95

The CIs do overlap so, as per the above, we cannot infer whether the means are significantly different or not – you would still need to run a dedicated test. However, here it’s not only the case that the CIs overlap, it’s actually the case that each mean is within the confidence intervals of every other mean, which very strongly suggests that the mean positions will be significantly different from each other (and the correct test would bear that out).

There is a much better version (that you weren’t expected to know about because it uses t.test, lapply and an anonymous function; we might get to this later):

qwe <- tapply(
   d$POSITION, # apply to the values of POSITION
   d$FILLER,   # a grouping by FILLER & then
   t.test)     # make those grouped values the 1st argum. to the function t.test
lapply(qwe, # apply to each element of the list qwe
   # an anonymous function extracting the CI:
   \(af) as.numeric("[["(af, "conf.int")))
$silence
[1] 6.302861 7.028464

$uh
[1] 6.207844 6.903831

$uhm
[1] 6.060054 6.888851

And here’s a (percentile) bootstrapping version – for simplicity’s sake, I am showing only a short version dealing with only one filler/disfluency and I’m not showing better bootstrapping approaches (that again produce virtually identical results here anyway):

set.seed(123) # set a random number generator
collector <- rep(NA, iterations <- 2000) # set up a collector vector
for (i in seq(iterations)) { # do something iterations times, namely
   POSITION_sampled <- sample(         # POSITION_sampled is a sample
      d$POSITION[d$FILLER=="silence"], # of all positions of silences
      sum(d$FILLER=="silence"),        # as many as there are silences
      replace=TRUE)                    # w/ replacement
   collector[i] <- mean(POSITION_sampled)
}
quantile(collector, probs=c(0.025, 0.975)) # extract 'CI'
    2.5%    97.5%
6.313253 7.018072 

14 Homework

To prepare for next week, read (and work through!) SFLWR3: Sections 3.2 to 3.4.

15 Session info

sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Pop!_OS 22.04 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
 [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

time zone: America/Los_Angeles
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  compiler  methods
[8] base

other attached packages:
[1] STGmisc_1.0    Rcpp_1.0.14    magrittr_2.0.3

loaded via a namespace (and not attached):
 [1] digest_0.6.37     fastmap_1.2.0     xfun_0.50         knitr_1.49
 [5] htmltools_0.5.8.1 rmarkdown_2.29    cli_3.6.3         rstudioapi_0.17.1
 [9] tools_4.4.2       evaluate_1.0.3    yaml_2.3.10       rlang_1.1.5
[13] jsonlite_1.8.9    htmlwidgets_1.6.4 MASS_7.3-64