#+ fig.width=12, fig.height=8
#### Chapter 3: Descriptive univariate statistics
# (1) Create the following numeric vector -- 1, 2, 4, 8, 10, 12 -- and compute summary/descriptive statistics of its raw version, its centered version, and its z-standardized version.
# Raw values: assign first, then print the vector.
qwe.raw <- c(1, 2, 4, 8, 10, 12)
qwe.raw
# Descriptives of the raw values: summary() extended by sd and IQR, rounded to 3 decimals.
round(c(summary(qwe.raw), sd = sd(qwe.raw), IQR = IQR(qwe.raw)), digits = 3)
# Centered version: every value minus the mean of the raw values.
qwe.cent <- qwe.raw - mean(qwe.raw)
qwe.cent
round(c(summary(qwe.cent), sd = sd(qwe.cent), IQR = IQR(qwe.cent)), digits = 3)
# z-standardized version: the centered values divided by the standard deviation of the raw values.
qwe.stand <- qwe.cent / sd(qwe.raw)
qwe.stand
round(c(summary(qwe.stand), sd = sd(qwe.stand), IQR = IQR(qwe.stand)), digits = 3)
# (2) Based on their performance in a test, 37 students were awarded the following grades (1=best, 6=worst):
# Build the individual grades from their frequencies:
# grade 1 occurs 2 times, grade 2 occurs 15 times, ..., grade 6 occurs 2 times.
grades.in.test <- rep(seq_len(6), times = c(2, 15, 10, 4, 4, 2))
# as a dataframe: grades.in.test.df <- data.frame(GRADES=grades.in.test)
# (a) Compute the most appropriate measure of central tendency.
# Grades like these are ordinal data, so the median is the measure to use.
median(grades.in.test)
# (b) Compute the most appropriate measure of dispersion.
# For ordinal data, the matching measure of dispersion is the interquartile range.
IQR(grades.in.test)
# (c) Represent the frequency distribution of the grades graphically in a dot chart.
# Dot chart of how often each grade was awarded.
# dotchart() warns "'x' is neither a vector nor a matrix" when it is handed a
# table object directly, so the 1-D table is first turned into a plain named
# vector with c(); its names (the grades) are still used as the row labels.
dotchart(
  x = c(table(grades.in.test)),             # what's to be plotted: frequency per grade
  main = "Frequencies of grades in a test", # the main heading
  xlab = "Observed frequency",              # x-axis label
  ylab = "Grades",                          # y-axis label
  xlim = c(0, 40),                          # x-axis limits: c(min, max)
  pch = 16)                                 # point character: 16 = filled circle
grid() # add a grid to the plot
# (d) Represent the frequency distribution of the grades graphically in a bar plot.
# Bar plot of the grade frequencies; barplot() returns the x-coordinates of the
# bar midpoints, which are stored in qwe (and printed) so that labels can be
# placed on the bars afterwards.
(qwe <- barplot(
  table(grades.in.test), # bar heights: the frequencies of grades.in.test
  space = 0))            # with no space between bars
# Write each frequency into the middle of its bar.
text(
  x = qwe,                        # x-axis coordinates: middles of the bars
  y = table(grades.in.test) / 2,  # y-axis coordinates: half the bar heights
  labels = table(grades.in.test)) # the text: the frequencies / bar heights
# Add the median and the interquartile range as an annotation. Both values are
# computed from the data rather than typed in, so the label cannot drift from
# grades.in.test; for the grades above this yields "Median=3" and
# "interquartile range=2".
text(
  x = 4,  # this x-axis coordinate
  y = 12, # this y-axis coordinate
  labels = paste0(
    "Median=", median(grades.in.test),
    "\ninterquartile range=", IQR(grades.in.test)))
# (3) Load the file <104_03_uh(m).csv> into a dataframe UHM
# Read the file into the data frame UHM, turning character columns into factors.
# read.delim() expects a header row and tab-separated fields by default.
UHM <- read.delim(file = "104_03_uh(m).csv", stringsAsFactors = TRUE)
# Print an overview of every column.
summary(UHM)
# Put the columns of UHM on the search path: the code below refers to them by
# their bare names (e.g. SEX, FILLER, LENGTH, GENRE).
attach(UHM)
# (4) Sort the data frame according to the factor SEX (ascending) and, within SEX, according to the disfluencies (descending) and, within disfluencies, according to the lengths of the disfluencies (ascending).
# order() sorts by its first key and uses each following key only to break ties:
#   SEX            ascending (the default)
#   -rank(FILLER)  descending (negating the ranks reverses the default order)
#   LENGTH         ascending (the default)
order.index <- order(SEX, -rank(FILLER), LENGTH)
# UHM.2 is UHM with its rows rearranged according to order.index (all columns kept).
UHM.2 <- UHM[order.index, ]
# (5) Compute the 95% confidence intervals for the proportions of the three disfluencies and discuss briefly what the confidence intervals suggest concerning the different frequencies of the disfluency markers.
# Absolute and relative frequencies of the fillers.
table(FILLER)
prop.table(table(FILLER))
# Exact binomial 95% confidence interval (the default confidence level) for the
# share of each filler: x = number of cases with that filler, n = number of all cases.
binom.test(x = sum(FILLER == "uh"), n = length(FILLER))$conf.int
binom.test(x = sum(FILLER == "uhm"), n = length(FILLER))$conf.int
binom.test(x = sum(FILLER == "silence"), n = length(FILLER))$conf.int
# The confidence intervals do not overlap so the proportions are significantly different.
# the much better version (that you weren't expected to know about):
# this prints the complete binom.test output for each filler; the confidence interval is the part of interest here
lapply(table(FILLER), # apply to each value of table(FILLER)
binom.test, # the function binom.test
n=length(FILLER)) # use length(FILLER) as the argument to binom.test that's called n (the 2nd, the number of trials)
# the bootstrapping version for "silence" (that you weren't expected to be able to do):
# Each replicate draws, with replacement, a resample that is as large as the
# observed data and counts how often "silence" occurs in it. The resample size
# and the denominator are taken from the data instead of being typed in, so the
# percentile interval stays correct if the number of rows changes.
n.fillers <- length(FILLER)          # number of observed fillers = size of each resample
collector <- rep(NA_integer_, 2000)  # one slot per bootstrap replicate
set.seed(1)                          # make the resampling reproducible
for (i in seq_along(collector)) {
  FILLER.sampled <- sample(FILLER, n.fillers, replace = TRUE)
  collector[i] <- sum(FILLER.sampled == "silence")
}
# 95% percentile interval of the bootstrapped proportions of "silence"
quantile(collector / n.fillers, probs = c(0.025, 0.975))
# (6) Determine how many disfluencies occurred in each genre.
# Frequency of disfluencies per genre.
table(GENRE)
# (7) Represent in a graph how many disfluencies occurred in each genre.
# Dot chart of the genre frequencies. The heading now describes this plot (it
# previously still read "Frequencies of grades in a test"). The table is turned
# into a plain named vector with c() because dotchart() warns "'x' is neither a
# vector nor a matrix" when given a table object; the names (the genres) are
# still used as the row labels.
dotchart(
  x = c(table(GENRE)),                            # what's to be plotted: frequency per genre
  main = "Frequencies of disfluencies per genre", # the main heading
  xlab = "Observed frequency",                    # x-axis label
  ylab = "Genres",                                # y-axis label
  xlim = c(0, 1000),                              # x-axis limits: c(min, max)
  pch = 16)                                       # point character: 16 = filled circle
grid() # add a grid to the plot
# Bar plot of the genre frequencies; barplot() returns the x-coordinates of the
# bar midpoints, which are stored in qwe and printed.
(qwe <- barplot(height = table(GENRE)))
# Write each genre's frequency into the middle of its bar.
text(
  x = qwe,                # x-axis coordinates: the middles of the bars
  y = table(GENRE) / 2,   # y-axis coordinates: half the bar heights
  labels = table(GENRE))  # the text: the frequencies of GENRE / bar heights
# (8) Was the 990th disfluency produced by a man or a woman?
SEX[990] # the 990th element of SEX, i.e. the sex of the speaker who produced the 990th disfluency
# (9) Determine how many disfluencies are longer than average.
# which() returns the positions where LENGTH exceeds its mean; length() counts them.
length(which(LENGTH > mean(LENGTH)))
# alternative
# Summing logical values counts the TRUEs, i.e. the cases where LENGTH is greater than its mean.
sum(LENGTH > mean(LENGTH))
# alternative (less ideal)
# Tabulate the signs of each value's difference to the mean; this also reports
# the cases that are not above the mean. # TMI
table(sign(LENGTH - mean(LENGTH)))
# alternative (less ideal)
# Tabulate the logical comparison itself; this reports FALSE as well as TRUE. # TMI
table(LENGTH > mean(LENGTH))