### TELL R WHERE TO LOOK FOR FILES (CUSTOMIZE AS NEEDED) ### setwd("C:/Documents and Settings/dan/Desktop/phonlab") ### OLS REGRESSION: CONSONANTS VS VOWELS ### ### read in the data, and take a look at the first few rows to get a feel for it. phoible <- read.delim("phoibleExampleData.txt", header=T, sep="\t") head(phoible) ### plot number of consonants vs number of vowels for each language. ### Since our data has >900 languages, it's quite likely to have some overlapping data points, ### so we'll make them semi-transparent using the col=rgb() argument. ### That way, overlapping data points will be evident because they'll show up as darker. plot(phoible$con, phoible$vow, type="p", pch=16, cex=1.25, col=rgb(0,100,0,50,maxColorValue=255), xlab="consonants", ylab="vowels", main="Vowel-to-Consonant Correlation(?)") ### create an OLS linear model predicting number of vowels from number of consonants, ### and draw the best-fit line on the graph lmFit.vc <- lm(phoible$vow ~ phoible$con) abline(lmFit.vc, col="red") ### show the statistical summary of the fitted model, and extract a few of the key numbers for use in the graph legend. summary(lmFit.vc) beta <- round(summary(lmFit.vc)$coefficients[2,1], digits=4) rsqu <- round(summary(lmFit.vc)$r.squared, digits=4) pval <- round(summary(lmFit.vc)$coefficients[2,4], digits=8) legend("topleft", legend=paste("\u03B2 =", beta, "\n R\u00B2 =", rsqu, "\n p =", pval), bty="n") ### note: \u03B2 is the glyph for Greek letter beta, \u00B2 is the superscript 2 ##################################### ### ANOVA: /u/ FRONTING ############# ### read in the data, and take a look at the first few rows to get a feel for it. regVow <- read.delim("uBackingExampleData.txt", header=T, sep="\t") head(regVow) ### for this example, let's look only at the /u/ vowel u <- subset(regVow, regVow$arpabet=="uw") ### and for ease of plotting, we'll separate out the three studies u.pb <- subset(u, u$study == "PB") u.pn <- subset(u, u$study == "PNW") u.hg <- subset(u, u$study == "HGCW") ### now, we'll plot all three studies on the same graph, but with different colors par(mfcol=c(1,1), mar=c(1,1,6.5,4.5)) plot(u.pb$F2, u.pb$F1, type="p", pch=16, col="red", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200)) par(new=T) plot(u.pn$F2, u.pn$F1, type="p", pch=16, col="blue", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200)) par(new=T) plot(u.hg$F2, u.hg$F1, type="p", pch=16, col="green", xaxt="n", yaxt="n", xlim=c(1800,600), ylim=c(600,200)) ### ...and add axes, title, and legend axis(3, at=600+c(0:12)*100, las=0, col.axis="black", cex.axis=0.8, tck=-.01) axis(4, at=200+c(0:4)*100, las=2, col.axis="black", cex.axis=0.8, tck=-.01) mtext("/u/ Backing in PNW English", side=3, cex=1.2, las=1, line=4.5, font=2) mtext("F2 (Hz)", side=3, cex=1, las=1, line=2.5, font=2) mtext("F1 (Hz)", side=4, cex=1, las=3, line=2.5, font=2) legend("topleft", legend=c("PB","PNW","HGCW"), pch=c(16,16,16), col=c("red","blue","green"), inset=0.02) ### but we still don't know if the differences seen in the graph are significant, so we run a factorial ANOVA and look at its results. anova.u <- aov(F2 ~ study*gender, data=u) summary(anova.u) ##################################### ### LINEAR MIXED MODELS: SPEAKER POPULATION VS NUMBER OF PHONEMES # ### here we're reusing the same phoible data from the OLS example. ### if you skipped that example, read in the data at this point: phoible <- read.delim("phoibleExampleData.txt", header=T, sep="\t") ### this time we're plotting number of speakers vs number of phonemes plot(phoible$pop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16) hist(phoible$pop, breaks=seq(0,1000000000,1000000), main="Histogram of Speaker Population Sizes", sub="bin size: one million", col="blue") ### the two plots above should have convinced you that the data is NOT normally distributed... ### ...so the next two plots remedy that by using log-transformed population numbers instead of raw numbers. plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16) hist(phoible$logPop, breaks=seq(0,25,1), main="Histogram of Speaker Population Sizes", sub="bin size: one log unit", col="blue") ### we can do an OLS fit like in the first example... lmFit.lp <- lm(phoible$pho ~ phoible$logPop) abline(lmFit.lp, col="red") ### ...and look at its statistical summary, extracting the relevant numbers for our graph. summary(lmFit.lp) beta <- round(summary(lmFit.lp)$coefficients[2,1], digits=4) rsqu <- round(summary(lmFit.lp)$r.squared, digits=4) pval <- round(summary(lmFit.lp)$coefficients[2,4], digits=16) legend("topleft", legend=paste("\u03B2 =", beta, "\n R\u00B2 =", rsqu, "\n p =", pval), bty="n") ### But our data violates an assumption of OLS regression, which is that the data are INDEPENDENT. ### On the contrary, our data points are systematically related to each other in virtue of being data ### about languages grouped into language families. This is typically called "Nested" data. ### To address this, we will fit separate regression lines for each family, using the LMER function in the LME4 package. library(lme4) ### This first model assumes that the relationship between population and phoneme inventory is the same for all languages/families... ### ...in other words, it assumes that the regression lines for each family all have the same slope, but varying intercepts. mixMod.lp <- lmer(pho~logPop+(1|fam), data=phoible) plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16) grandIntercept <- unlist(fixef(mixMod.lp)[1]) grandSlope <- unlist(fixef(mixMod.lp)[2]) randomIntercepts <- as.vector(unlist(ranef(mixMod.lp))) coefficientsByGroup <- cbind(grandIntercept+randomIntercepts,grandSlope) for (i in c(1:nrow(coefficientsByGroup))) abline(coefficientsByGroup[i,1], coefficientsByGroup[i,2], col=rgb(255,0,0,40,maxColorValue=255)) ### This second model also allows the slope to vary across families. mixMod.lp.rs <- lmer(pho~logPop+(1+logPop|fam), data=phoible) plot(phoible$logPop, phoible$pho, col=rgb(0,100,0,50,maxColorValue=255), pch=16) grandIntercept <- unlist(fixef(mixMod.lp.rs)[1]) grandSlope <- unlist(fixef(mixMod.lp.rs)[2]) randomIntercepts <- data.frame(ranef(mixMod.lp.rs)[1])[,1] randomSlopes <- data.frame(ranef(mixMod.lp.rs)[1])[,2] coefficientsByGroup <- cbind(grandIntercept+randomIntercepts,grandSlope+randomSlopes) for (i in c(1:nrow(coefficientsByGroup))) abline(coefficientsByGroup[i,1], coefficientsByGroup[i,2], col=rgb(255,0,0,40,maxColorValue=255)) ### So which one is a "better" model? And are either of them "significant"? ### compare outputs: summary(mixMod.lp) summary(mixMod.lp.rs) ### Notice there are no "p values." Interpretation of mixed model results is very tricky. If you are going to use this in your own research, ### see the articles and books referenced below, or take a class on Heirarchical Linear Modelling. But for now, look at one particular feature ### of the output: the "correlation" column of the Random Effects table. This tells you that intercept and slope are highly correlated, and therefore ### you might conclude that allowing random variation of slopes is not providing much additional predictive power. One further point is that for large ### datasets like this one, the t-distribution closely approximates the p-distribution, so you can make a "ballpark" judgment of significance based on ### the t-values reported in the summary. In the case of the mixMod.lp model, we see that "family" has a t-value up around 20 standard deviations, whereas ### log(population) has a t-value around zero. This means that family is a highly significant predictor of phoneme inventory size, and that speaker population ### is NOT a significant predictor once we take family into account. ### REFERENCES ### # Gelman, A. & J. Hill (2007). Data Analysis Using Regression and Multilevel/Hierarchical Models. New York: CambridgeUP. # Baayen, R.H. (2008). Analysing Linguistic Data. New York: CambridgeUP. (see esp. chapter 7). # Baayen, R.H., D.J. Davidson, & D.M. Bates (2008). "Mixed-effects modeling with crossed random effects for subjects and items." Journal of Memory and Language 59: 390–412 # Jaeger, T.F. (2008). "Categorical data analysis: Away from ANOVAs (transformation or not) and towards logit mixed models." Journal of Memory and Language 59: 434–446. # http://www.statmethods.net/ # http://www.r-tutor.com/