This page shows the methods and data I've used to create figures uploaded to wikimedia commons for use in articles. The idea is to make it as easy as possible for anyone else to replace the figures I've made with better ones! The code provided is in the R programming language, and I recommend the great open source IDE RStudio for playing about with it. Let me know on my talk page if you have any questions or criticisms.

United States employment statistics

[edit]

Data

[edit]

Two data sets offered by the US gov (.xls excel spreadsheets):

NB. These seem to be kept reasonably up to date, so these plots can be updated as new data becomes available.

Code

[edit]
R code
require("zoo")
# first save xls as csv, then...
# unemployment
u <- read.csv("Unemployment.csv", header=T)
u <- melt(u, "Year")

unemployment <- data.frame(date=as.yearmon(do.call(paste, u[,1:2]), "%Y %b"),
                           rate=u$value)
unemployment <- unemployment[unemployment$date > as.yearmon("2008-12"),]
u2 <- unemployment
colnames(u2) <- c("date", "rate")

# net change in jobs
n <- read.csv("netChange.csv", header=T)
n <- melt(n, "Year")

# Check (!) dates are the same in each input, else repeat parse 
all.equal(net.change$date, unemployment$date) # TRUE
net.change <- data.frame(date=unemployment$date,
                         change=n$value)
net.change <- net.change[net.change$date > as.yearmon("2008-12"),]

dev.off()
svg("new.svg", 7.5, 5)
par(mar=c(5.5,4.5,5,3.8), mgp=c(1.8,.7,0))
# sort by date
net.change <- net.change[order(net.change$date),]
# original starts at 09
net.change <- net.change[net.change$date > as.yearmon("2008-12"),]

#par(yaxs="i")
bpos <- barplot(net.change$change, plot=F)
x <- as.Date(net.change$date)
# scale will be integer (== month)
plot(1:length(x), net.change$change, type="n", ylim=c(-1000, 600),
     frame=F, axes=F, xlab="", ylab="", xlim=c(2.7, length(x)-1.7))
rect(xleft=1:length(x)-.3, xright=1:length(x)+.3, 
     ybottom=0, ytop=net.change$change, col="#4F81BD", lend=2)
abline(h=0, lwd=1)

## Awful code, avert your eyes
labs <- format(as.Date(as.character(cut(as.Date(seq(x[1], x[length(x)], length.out=10), 
              format= "%M %Y"), breaks="months"))), "%b %Y")

axis(1, at=seq(1, length(x), length.out=length(labs)), 
     labels=labs, las=3)
axis(2, at=seq(-1000, 600, by=200), las=1)
mtext("Net change in employment per month (1000s of jobs)", 
      side=2, col="#1F497D", line=3)

## now unemployment
unemployment <- unemployment[order(unemployment$date),]
unemployment <- unemployment[unemployment$date > as.yearmon("2008-12"),]
par(new=T)
# set up same plot
plot(1:length(x), unemployment$rate, type="n", ylim=c(6.5, 10),
     frame=F, axes=F, xlab="", ylab="", xlim=c(2.7, length(x)-1.7))
lines(1:length(x), unemployment$rate, col="#C0504D", lwd=3.5)
axis(4, las=1)

text(length(x)*1.1, (par("usr")[4] + par("usr")[3])/2, "Percent unemployed", 
     srt = 270, xpd = TRUE, col="#C0504D")
text(length(x)*.85, ((par("usr")[4] + par("usr")[3])/4) *1.65, "Unemployment rate", 
     xpd = TRUE, col="#C0504D")
text(length(x)*.215, ((par("usr")[4] + par("usr")[3])/4) *1.75, "Jobs created or lost", 
     xpd = TRUE, col="#4F81BD")

mtext("United States Employment Statistics\n Jan 2009 - Dec 2013", side=3, 
      col="#1F497D", cex=1.4, line=2, font=2)
mtext("Monthly change, seasonally adjusted", side=3, cex=1.1, line=1)
dev.off()

Figure

[edit]
Original (Raster) New (Vector)


Cost of genome sequencing

[edit]

Data

[edit]

Table from NHGRI, should be updated "frequently".

Code

[edit]
R code
library(zoo)
library(scales)

# from http://www.genome.gov/pages/der/sequencing_cost_apr13.xls
seq <- read.table("<filename.csv>", stringsAsFactors=F, header=T)
seq$CostPerMb <- NULL

seq <- melt(seq, 1)
seq$Date <- as.yearmon(seq$Date, format="%b-%y")

options(scipen=99)
svg("figs/genome_cost.svg", 5, 4.5)
ggplot(seq, aes(x=as.Date(Date), y=value)) +
  labs(y="Cost per genome (USD)", x="", col="") +
  geom_hline(yintercept=1000, linetype="dashed", 
             col=I(muted("red"))) + 
  geom_line(size=1.5, col=I(muted("blue"))) + 
  scale_y_log10(limits=c(900,100000000),
                breaks=c(1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
                labels=comma) +
  ggtitle("Genome sequencing cost as estimated by NHGRI\n
          (September 2001 to April 2013)") +
  theme(plot.title=element_text(size=12))
dev.off()

Figure

[edit]

Comments

[edit]

The source code is no longer accurate, as I updated and redid the data using gnuplot. (It seemed easier for simple cases. Now that I'm learning R, that might not be the case further on, but regardless, the code on the image page doesn't match what's above. grendel|khan 05:51, 27 July 2016 (UTC)

History (and predicted future) size of the Sequence Read Archive

[edit]

Data

[edit]

A .csv containing the growth information up to (an extrapolated) month in late 2013 can be downloaded here from the NCBI website. The sequencing platform data is hardcoded below but was retreived from this publication.

Code

[edit]
R code
# R (2.15.1)
library(plotrix)
library(sfsmisc)
library(TeachingDemos)

# Pie chart data
slices <- c(84, 12, 2, 2) 
lbls <- c("Illumina GA", "SOLiD", "454", "Other")

# Suppress scientific notation
options(scipen=5)

###### Draw plot ###### 

svg("~/SRA_growth.svg", 6,5)
par(mar=c(2,4,2,2))
sra_stat <- read.csv("~/<DATA_LOCATION>/sra_stat.csv")
sra_stat[,1] <- as.Date(sra_stat[,1], format="%m/%d/%Y")
sra_stat <- sra_stat[2:nrow(sra_stat),]

# Line plot
plot(sra_stat[,1], sra_stat[,2], xlab="", ylab="Number of Bases", 
     type="l", log="y", col="darkgreen", lwd=2, yaxt="n")
points(sra_stat[,1], sra_stat[,3], type="l", col="lightgreen", lwd=2)
axis(2,at=axTicks(2),label=axTexpr(2))
legend("topleft", fill=c("darkgreen", "lightgreen"), 
       legend=c("All Bases", "Open Access"))

# Add pie chart inset

par(cex=0.8)

subplot(pie(slices, labels=lbls, main="", col = c(topo.colors(4))), 

        as.Date("2011/07/14"), 5E11, size=c(2.5,2.5))
dev.off()

Figure

[edit]

Comments

[edit]

EMBL-Bank Growth

[edit]

Data

[edit]

The size of EMBL-Bank in nucleotides is reported in each set of Release Notes. This graph was drawn from Release Notes for version 114 (currently available here as of Jan 2013). For use in the code below, the relevant table was copied from the report and pasted to a file (EMBLBankGrowth below).

Code

[edit]
R code
# R (2.15.1)
library(zoo)
library(sfsmisc)
svg("~/other/EMBLBank_growth3.svg", 6, 6)
options(scipen=3)
par(mar=c(2.5,4.1,2.5,2.1), xaxs="r", lend=2)

# Load and prepare data
ebg <- as.data.frame(read.table("~/<DATA-LOCATION>/EMBLBankGrowth", 
    header=T, quote="\"", colClasses=c("numeric", "character","numeric", "numeric")))
ebg[,2] <- as.yearmon(gsub("/", "-", ebg[,2]), "%m-%Y")
ebg <- as.data.frame(read.table("~/Desktop/EMBLBankGrowth", header=T, quote="\"",

# Set up plot 
plot(ebg[,2], ebg[,4], type="l", log="y", col="blue", lwd=2, xlab="", 
     ylab="Number of Entries or Nucleotides", yaxt="n", xaxt="n", 
     ylim=c(min(ebg[,3]), 1E12), xlim=c(1982,2013), main="Growth of EMBL-Bank (1982-2012)")
axis(2,at=axTicks(2),label=axTexpr(2))
axis(1, at=seq(1980,2015,5))
grid(lty=1)

# Add line for entries
points(ebg[,2], ebg[,3], type="l", log="y", col="red", lwd=2)
legend("topleft", fill=c("blue","red"), legend=c("Nucleotides", "Entries"), cex=0.8)
dev.off()

Figure

[edit]

Comments

[edit]

Wikipedia Editor numbers analysis

[edit]

Data

[edit]

Data retrieved from stats.wikimedia.org. Copy and paste table into a plaintext file to keep relative column numbering. Data used in the figure runs from January 2001 up to December 2012.

Code

[edit]
R code
library(zoo)
wiki <- read.delim("~/<DATA>", header=F)
par(mar=c(2,4.5,1,2))

svg("~/Desktop/wiki_stats.svg",7,7 )
wiki[,1] <- as.yearmon(wiki[,1],"%b %Y")

# Active (10 edits / month)
plot(wiki[,1], wiki[,4], type="l", xaxt="n", ylab="Number of editors", lwd=4,
     col="darkblue", xlab="")

# Really active (100 edits / month)
points(wiki[,1], wiki[,5], type="l", lwd=4, col="darkgreen")
axis(1, at=seq(2001,2012,1), cex.axis=0.8)
legend("topleft", c("10 edits / month", "100 edits / month"), 
       col=c("darkblue","darkgreen"), bty="n", lwd=4)

# Regression from 2007 onwards:
library(car)
recent  <- wiki[wiki[,1] > 2007,]
reg <- lm(recent[,4] ~ recent[,1])
regLine(reg, lwd=2)
text(2011,45000,paste("~", signif(abs(reg$coefficients[[2]]),2), 
     " fewer per year", sep=""), col="darkblue")
text(2011,43000,paste(signif(abs(reg$coefficients[[2]]) / max(recent[,4]) * 100,3), 
     "% of max.", sep=""), col="darkblue")

reg2 <- lm(recent[,5] ~ recent[,1])
regLine(reg2, lwd=2, col="black")
text(2011,10000,paste("~", signif(abs(reg2$coefficients[[2]]),2), 
     " fewer per year", sep=""), col="darkgreen")
text(2011,8000,paste(signif(abs(reg2$coefficients[[2]]) / max(recent[,5]) * 100,3), 
     "% of max.", sep=""), col="darkgreen")
dev.off()

Figure

[edit]

Comments

[edit]

RfA voting analysis (old data)

[edit]

Data

[edit]

Data found in this table from 2011, tab-separated version used in this script available at github.

Code

[edit]
R code
# relative path if sourced with master.R
a <- read.table("../data/wvotes.tsv", fill=T, stringsAsFactors=F, header=T)

# format messy string into date
a$Firstedit <- as.Date(a$Firstedit, "%B_%d,_%Y")

# Do support/oppose levels differ between admins and non-admins?
ggplot(a, aes(x=Opposes, y=Supports, col=Admin, group=Admin)) + 
 geom_jitter(width=4, height=4) + geom_smooth(method="lm", se=F) + theme_bw() 

# Filter users who haven't voted much
b <- a[a$Supports + a$Opposes + a$Neutrals > 10,]
library(gridExtra)
svg("../figs/RfAvote_analysis.svg",15, 5)
grid.arrange(
 # Do those with higher edit counts support more?
 ggplot(b, aes(x=Editcount+1, y=100*(Supports/(Supports+Opposes+Neutrals)), col=Admin)) + 
 geom_jitter() + scale_x_log10() + geom_smooth(method="lm", se=T, size=1.5) + 
 theme_bw() + xlab("Number of edits") + ylab("Support percentage (Users with >10 votes)"),
 # Do old-timers support more than newer editors?
 ggplot(b, aes(x=Firstedit+1, y=100*(Supports/(Supports+Opposes+Neutrals)), col=Admin)) + 
 geom_jitter() + geom_smooth(method="lm", se=T, size=1.5) + 
 theme_bw() + xlab("Date of first edit") + ylab("Support percentage (Users with >10 votes)"),
 # Do admins on the whole support more RfAs than editors?
 ggplot(b, aes(x=Admin, y=100*(Supports/(Supports+Opposes+Neutrals)), fill=Admin)) + 
 geom_violin(alpha=I(.5), notch=T) + xlab("Admin") + theme_bw() +
 ylab("Support percentage (Users with >10 votes)")
 , ncol=3)
dev.off()

##Let's do some stats, do admins support more or less than non-admins?
admin <- b[b$Admin == "Yes",]
editor <- b[b$Admin == "No",]

adminvotes <- admin$Supports / admin$RfAVotes
editorvotes <- editor$Supports / editor$RfAVotes
wilcox.test(adminvotes, editorvotes)
# W = 14184.5, p-value = 0.002663

wilcox.test(admin$Supports, editor$Supports)
# W = 12256, p-value = 0.5884

wilcox.test(admin$RfAvotes, editor$RfAvotes)
# W = 11318, p-value = 0.5117

wilcox.test(admin$Neutrals + admin$Opposes, editor$Neutrals + editor$Opposes)
# W = 9463, p-value = 0.002441

## Commentary:
"""
Admins !voters have a significantly higher support % than non-admins (p < 0.01), 
though they haven't actually !voted support more times than the average editor 
(mean supports, admin: 22.99, editor: 22.76) so possibly just !vote a little 
less often and give fewer oppose + neutral !votes 
(mean # oppose + neutrals, admin: 7.89, editor: 10.09,  p < 0.01).
"""

Figure

[edit]

Comments

[edit]