### TELL R WHERE TO LOOK FOR FILES (CUSTOMIZE AS NEEDED) ###
setwd("C:/Documents and Settings/dan/Desktop/phonlab")

### OLS REGRESSION: CONSONANTS VS VOWELS ###
### read in the data, and take a look at the first few rows to get a feel for it.
phoible <- read.delim("phoibleExampleData.txt", header=T, sep="\t")
head(phoible)

### plot number of consonants vs number of vowels for each language.
### Since our data has >900 languages, it's quite likely to have some overlapping data points,
### so we'll make them semi-transparent using the col=rgb() argument.  
### That way, overlapping data points will be evident because they'll show up as darker.
plot(phoible$con, phoible$vow, type="p", pch=16, cex=1.25, col=rgb(0,100,0,50,maxColorValue=255), xlab="consonants", ylab="vowels", main="Vowel-to-Consonant Correlation(?)")

### create an OLS linear model predicting number of vowels from number of consonants,
### and draw the best-fit line on the graph
lmFit.vc <- lm(phoible$vow ~ phoible$con)
abline(lmFit.vc, col="red")

### show the statistical summary of the fitted model, and extract a few of the key numbers for use in the graph legend.
summary(lmFit.vc)
beta <- round(summary(lmFit.vc)$coefficients[2,1], digits=4)
rsqu <- round(summary(lmFit.vc)$r.squared, digits=4)
pval <- round(summary(lmFit.vc)$coefficients[2,4], digits=8)
legend("topleft", legend=paste("\u03B2 =", beta, "\n R\u00B2 =", rsqu, "\n p =", pval), bty="n")

### note: \u03B2 is the glyph for Greek letter beta, \u00B2 is the superscript 2
#####################################


### ANOVA: /u/ FRONTING #############
### read in the data, and take a look at the first few rows to get a feel for it.
regVow <- read.delim("uBackingExampleData.txt", header=T, sep="\t")
head(regVow)

### for this example, let's look only at the /u/ vowel
u <- subset(regVow, regVow$arpabet=="uw")

### and for ease of plotting, we'll separate out the three studies
u.pb <- subset(u, u$study == "PB")
u.pn <- subset(u, u$study == "PNW")
u.hg <- subset(u, u$study == "HGCW")

### now, we'll plot all three studies on the same graph, but with different colors
par(mfcol=c(1,1), mar=c(1,1,6.5,4.5))
plot(u.pb$F2, u.pb$F1, type="p", pch=16, col="red", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200))
par(new=T)
plot(u.pn$F2, u.pn$F1, type="p", pch=16, col="blue", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200))
par(new=T)
plot(u.hg$F2, u.hg$F1, type="p", pch=16, col="green", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200))

### ...and add axes, title, and legend
axis(3, at=600+c(0:12)*100, las=0, col.axis="black", cex.axis=0.8, tck=-.01)
axis(4, at=200+c(0:4)*100, las=2, col.axis="black", cex.axis=0.8, tck=-.01)
mtext("/u/ Backing in PNW English", side=3, cex=1.2, las=1, line=4.5, font=2)
mtext("F2 (Hz)", side=3, cex=1, las=1, line=2.5, font=2)
mtext("F1 (Hz)", side=4, cex=1, las=3, line=2.5, font=2)
legend("topleft", legend=c("PB","PNW","HGCW"), pch=c(16,16,16), col=c("red","blue","green"), inset=0.02)

### but we still don't know if the differences seen in the graph are significant, so we run a factorial ANOVA and look at its results.
anova.u <- aov(F2 ~ study*gender, data=u)
summary(anova.u)

#####################################


### LINEAR MIXED MODELS: SPEAKER POPULATION VS NUMBER OF PHONEMES #
### here we're reusing the same phoible data from the OLS example.
### if you skipped that example, read in the data at this point:
phoible <- read.delim("phoibleExampleData.txt", header=T, sep="\t")

### this time we're plotting number of speakers vs number of phonemes
plot(phoible$pop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16)
hist(phoible$pop, breaks=seq(0,1000000000,1000000), main="Histogram of Speaker Population Sizes", sub="bin size: one million", col="blue")

### the two plots above should have convinced you that the data is NOT normally distributed...
### ...so the next two plots remedy that by using log-transformed population numbers instead of raw numbers.
plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16)
hist(phoible$logPop, breaks=seq(0,25,1), main="Histogram of Speaker Population Sizes", sub="bin size: one log unit", col="blue")

### we can do an OLS fit like in the first example...
lmFit.lp <- lm(phoible$pho ~ phoible$logPop)
abline(lmFit.lp, col="red")

### ...and look at its statistical summary, extracting the relevant numbers for our graph.
summary(lmFit.lp)
beta <- round(summary(lmFit.lp)$coefficients[2,1], digits=4)
rsqu <- round(summary(lmFit.lp)$r.squared, digits=4)
pval <- round(summary(lmFit.lp)$coefficients[2,4], digits=16)
legend("topleft", legend=paste("\u03B2 =", beta, "\n R\u00B2 =", rsqu, "\n p =", pval), bty="n")

### But our data violates an assumption of OLS regression, which is that the data are INDEPENDENT.
### On the contrary, our data points are systematically related to each other in virtue of being data
### about languages grouped into language families.  This is typically called "Nested" data.
### To address this, we will fit separate regression lines for each family, using the LMER function in the LME4 package.
library(lme4)

### This first model assumes that the relationship between population and phoneme inventory is the same for all languages/families...
### ...in other words, it assumes that the regression lines for each family all have the same slope, but varying intercepts.
mixMod.lp <- lmer(pho~logPop+(1|fam), data=phoible)
plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16)
grandIntercept <- unlist(fixef(mixMod.lp)[1])
grandSlope <- unlist(fixef(mixMod.lp)[2])
randomIntercepts <- as.vector(unlist(ranef(mixMod.lp)))
coefficientsByGroup <- cbind(grandIntercept+randomIntercepts,grandSlope)
for (i in c(1:nrow(coefficientsByGroup))) abline(coefficientsByGroup[i,1], coefficientsByGroup[i,2], col=rgb(255,0,0,40,maxColorValue=255))

### This second model also allows the slope to vary across families.
mixMod.lp.rs <- lmer(pho~logPop+(1+logPop|fam), data=phoible)
plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16)
grandIntercept <- unlist(fixef(mixMod.lp.rs)[1])
grandSlope <- unlist(fixef(mixMod.lp.rs)[2])
randomIntercepts <- data.frame(ranef(mixMod.lp.rs)[1])[,1]
randomSlopes <- data.frame(ranef(mixMod.lp.rs)[1])[,2]
coefficientsByGroup <- cbind(grandIntercept+randomIntercepts,grandSlope+randomSlopes)
for (i in c(1:nrow(coefficientsByGroup))) abline(coefficientsByGroup[i,1], coefficientsByGroup[i,2], col=rgb(255,0,0,40,maxColorValue=255))

### So which one is a "better" model? And are either of them "significant"?
### compare outputs:
summary(mixMod.lp)
summary(mixMod.lp.rs)

### Notice there are no "p values."  Interpretation of mixed model results is very tricky.  If you are going to use this in your own research,
### see the articles and books referenced below, or take a class on Heirarchical Linear Modelling.  But for now, look at one particular feature
### of the output:  the "correlation" column of the Random Effects table.  This tells you that intercept and slope are highly correlated, and therefore
### you might conclude that allowing random variation of slopes is not providing much additional predictive power.  One further point is that for large
### datasets like this one, the t-distribution closely approximates the p-distribution, so you can make a "ballpark" judgment of significance based on
### the t-values reported in the summary. In the case of the mixMod.lp model, we see that "family" has a t-value up around 20 standard deviations, whereas 
### log(population) has a t-value around zero.  This means that family is a highly significant predictor of phoneme inventory size, and that speaker population
### is NOT a significant predictor once we take family into account.

### REFERENCES ###
# Gelman, A. & J. Hill (2007). Data Analysis Using Regression and Multilevel/Hierarchical Models. New York: CambridgeUP.

# Baayen, R.H. (2008). Analysing Linguistic Data. New York: CambridgeUP.  (see esp. chapter 7).

# Baayen, R.H., D.J. Davidson, & D.M. Bates (2008). "Mixed-effects modeling with crossed random effects for subjects and items." Journal of Memory and Language 59: 390–412

# Jaeger, T.F. (2008). "Categorical data analysis: Away from ANOVAs (transformation or not) and towards logit mixed models." Journal of Memory and Language 59: 434–446.

# http://www.statmethods.net/

# http://www.r-tutor.com/