forked from dgopstein/atom-finder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
code_age.R
173 lines (127 loc) · 8.11 KB
/
code_age.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
library(data.table)
library(ggplot2)
library(RColorBrewer)
# Resolve every relative path below against the directory holding this script.
# NOTE(review): rstudioapi::getActiveDocumentContext() presumably needs a live
# RStudio session, so this would not work under plain Rscript -- confirm.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# Project-local helper scripts; what they define is not visible from this file.
source("util.R")
source("project-age.R")

# Wide table read from the CSV. Later code relies on its project, date,
# rev.str and all.nodes columns and melts every remaining column into an
# (atom, count) pair.
code.age.wide <- data.table(
  read.csv("data/code-age_all_2018-08-30_parse-source-args.csv")
)
code.age.wide[, date := as.Date(date)]

# Keep rows with all.nodes above 1000 whose date lies strictly between
# 1984-01-01 and 2018-01-01, then sort them chronologically.
code.age.wide <- code.age.wide[
  all.nodes > 1000 & date > '1984-01-01' & date < '2018-01-01'
][order(date)]

# Fold the 'linux-historical' rows into the 'linux' project.
code.age.wide[project=='linux-historical']$project <- 'linux'

# Remove all emacs commits because of the K&R style code that breaks all pre-2010 analyses
# https://github.com/emacs-mirror/emacs/commit/971de7fb158335fbda39525feb2d7776a26bc030
# (every variant of that removal/correction below is currently commented out)
# code.age.wide[project=='emacs' & date >= '2011-01-01', .(project, date, (all.nodes - non.atoms)/all.nodes)]
# code.age.wide <- code.age.wide[!(project=='emacs' & date < '2011-01-01'),]
# code.age.all.atoms[project=='emacs' & date < '2011-01-01', count := as.integer(2.18*count)]
# code.age.wide <- code.age.wide[project!='emacs']
# Reshape to long format: project, date, rev.str and all.nodes stay as id
# columns; every other column becomes one (atom, count) row.
code.age <- melt(code.age.wide,
                 id.vars = c("project", "date", "rev.str", "all.nodes"),
                 variable.name = "atom", value.name = "count")

# Per revision: sum the counts of every atom column except 'non.atoms', and
# carry all.nodes along (mean() collapses the value repeated on each melted row).
code.age.all.atoms <- code.age[!atom %in% c('non.atoms')][
  , .(count = sum(count), all.nodes = mean(all.nodes)),
  by = c('project', 'date', 'rev.str')
]
code.age.all.atoms[, rate := count / all.nodes]

# 5-point rolling median of the rate within each project.
# fill = NA pads the ends so the result has exactly one value per row; without
# it rollmedian() returns fewer values than the group has rows, which does not
# match the length `:=` needs for a per-group assignment.
code.age.all.atoms[, smooth.rate := zoo::rollmedian(rate, 5, fill = NA), by = project]

# Earliest sampled date of each project ...
first.points <- code.age.all.atoms[, .(date = min(date)), by = project]
#code.age.all.atoms[date%in%first.points$date]$smooth.rate <- code.age.all.atoms[date%in%first.points$date]$rate
# ... and the complete row belonging to that earliest date.
first.data <- code.age.all.atoms[, .SD[which.min(date)], by = project]
# Atom rate over time, one line per project. A large dot and a text label mark
# the earliest row of each project (taken from first.data).
ggplot(code.age.all.atoms) +
  geom_line(aes(x = date, y = rate, group = project, colour = project), size = 0.5) +
  geom_point(aes(x = date, y = rate, colour = project), data = first.data, size = 5) +
  geom_text(aes(x = date, y = rate, label = paste(" ", project)),
            data = first.data, hjust = 0, angle = 0) +
  # 14 colours interpolated from the 9-colour "Set1" Brewer palette
  scale_colour_manual(values = colorRampPalette(brewer.pal(9, "Set1"))(14)) +
  theme_classic()
# Rolling stddev - https://rviews.rstudio.com/2017/07/18/introduction-to-rolling-volatility/
# rollapply is called through the zoo namespace because this file never
# attaches zoo. fill = NA pads the window edges so the sd column has the same
# length as the date column it sits next to in each group.
code.age.all.atoms[, .(date, sd = zoo::rollapply(rate, 5, stats::sd, fill = NA)), by = project]

# Same statistic over a 3-point window, drawn as one line per project.
# The NA-padded end points carry no sd value and therefore are not drawn.
ggplot(code.age.all.atoms[, .(date, sd = zoo::rollapply(rate, 3, stats::sd, fill = NA)), by = project]) +
  geom_line(aes(date, sd, group = project))
# mean atom rate by project start-date
# Overall rate per project = total count / total all.nodes over all of its
# revisions (both summed as doubles), joined onto project.age by project.
# NOTE(review): project.age comes from a sourced script; its date column is
# presumably the project start date (the plot axis is labelled that way) -- confirm.
project.age.mean.atoms <- merge(
  project.age[, .(project, domain, date)],
  code.age.all.atoms[
    , .(rate = base::sum(as.numeric(count)) / base::sum(as.numeric(all.nodes))),
    by = project
  ]
)
project.age.mean.atoms

# Robust regression of rate on date; its weights are then fed into an ordinary
# weighted lm so that summary() provides R^2 and a coefficient table.
rlm.model <- MASS::rlm(rate ~ date, data = project.age.mean.atoms)
rlm.lm.model <- summary(
  lm(rate ~ date, data = project.age.mean.atoms, weights = rlm.model$w)
)

# NOTE(review): the sign is hard-coded as negative and the value is taken from
# the *adjusted* R^2 -- confirm this is the intended correlation estimate.
pearson.corr <- -sqrt(rlm.lm.model$adj.r.squared)

# Coefficient table of the weighted fit; the date slope is per day, so
# multiplying by 365 expresses it per year.
slope <- coef(rlm.lm.model)
365 * slope["date", "Estimate"]
# Hand-tuned label nudges, one entry per project in the order of
# offset.projects. geom_text below scales them: x by 20 (days on the date
# axis) and y by 0.0001 (rate units).
offset.projects <- c("clang", "emacs", "freebsd", "gcc", "gecko-dev", "git", "httpd", "linux", "mongo", "mysql-server", "nginx", "subversion", "vim", "webkit")
offset.x <- c(0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2)
offset.y <- c(0, 5, 0, 0, -3, 0, 0, 0, 0, -3, 0, 2, 0, 3)
#offset.x <- offset.y <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
project.age.mean.atoms.offsets <- merge(project.age.mean.atoms, data.frame(offset.projects, offset.x, offset.y), by.x="project", by.y="offset.projects")

# Scatter of each project's overall atom rate against its date, with a robust
# (MASS::rlm, MM-estimator) trend line and a slanted label per project.
# NOTE(review): the "r==-0.65" annotation is a hard-coded literal; it is not
# derived from any value computed in this script.
mean.atoms.by.project.age <-
  ggplot(project.age.mean.atoms.offsets, aes(x=date, y=rate)) +
  theme_classic() +
  #geom_smooth(method="rlm", colour="black", size=0.5, se=FALSE, fullrange=TRUE) +
  stat_smooth(colour=colors2dark[1], size=1, se=FALSE, fullrange=TRUE,
              method=function(f,data=data,weights=weight) MASS::rlm(f,data,weights=weight,method="MM")) +
  geom_point(size=3, color=colors2dark[2]) +
  geom_text(aes(label=paste(" ", project), x=date+20*offset.x, y=rate+.0001*offset.y), size = 3, angle=-17, hjust=0, vjust=0.4) +
  #geom_text_repel(aes(label=project), size = 4, angle=-20, force=0.1, direction="x") +
  scale_x_date(limits = as.Date(c("1985-01-01", "2017-01-01"))) +
  scale_y_continuous(limits = c(.00, .026)) +
  scale_colour_manual(values = domain.colors) +
  annotate("text", x=as.Date('2002-01-01'), y=0.019, label="r==-0.65", parse=TRUE, hjust=0.0, size=4.0) +
  theme(legend.position = c(0.9, 0.6)) +
  labs(x = "Project Start Date", y = "Average Atom Rate", colour="Domain")
mean.atoms.by.project.age

# Figure width in mm; the height is half of it. The assignment is made before
# the call instead of inside the argument list, so the height no longer
# depends on the order in which ggsave() happens to evaluate its arguments.
width <- 145
ggsave("img/mean_atoms_by_project_age.pdf", mean.atoms.by.project.age,
       width = width, height = width * 0.5, units = "mm")
# Pair each project's overall atom rate with the mean date of its sampled
# revisions. (This statement used to appear twice verbatim; once is enough.)
mean.project.date.atoms <- merge(project.age.mean.atoms[, c('project', 'rate')],
                                 code.age.all.atoms[, .(date = mean(date)), by = project])
# One linear trend (rate ~ date) per project, stored next to the project's
# earliest date. first.points holds one row per project, so row i of this
# table and row i of first.points describe the same project.
project.age.linear.models <-
  data.table(project = first.points$project,
             model = lapply(first.points$project,
                            function(proj) lm(rate ~ date, code.age.all.atoms[project == proj])),
             date = first.points$date)

# projects estimated by lines
# Rate predicted by each project's own trend at that project's earliest date.
# The fits stored above are reused rather than refitted, and vapply pins the
# result to exactly one number per project.
first.points$linear.rate <- vapply(
  seq_len(nrow(first.points)),
  function(row) {
    unname(predict(project.age.linear.models$model[[row]],
                   data.frame(date = first.points$date[row])))
  },
  numeric(1)
)

# Slope of each project's trend (change in rate per day).
first.points$angle <- vapply(
  project.age.linear.models$model,
  function(model) unname(coef(model)["date"]),
  numeric(1)
)

# Attach every project's linear.rate to all of its revision rows.
code.age.for.regression <- merge(code.age.all.atoms,
                                 first.points[, .(project, linear.rate)],
                                 by = 'project')
# Linear trend of the emacs atom rate restricted to rows dated 2011-01-01 or
# later. (This summary used to be computed and printed twice verbatim.)
summary(lm(rate ~ date, data=code.age.all.atoms[project=='emacs' & date >= '2011-01-01']))

# The same subset as points with its least-squares line.
ggplot(code.age.all.atoms[project=='emacs' & date >= '2011-01-01'], aes(date, rate)) +
  geom_point() +
  stat_smooth(aes(date, rate, group=project, color=project), method="lm", size=0.5, se=FALSE)
# Per project: slope of a no-intercept regression of (rate - linear.rate) on
# date. vapply guarantees exactly one number per project.
first.points$angle.no.intercept <- vapply(
  first.points$project,
  function(proj) {
    fit <- lm(I(rate - linear.rate) ~ 0 + date, code.age.for.regression[project == proj])
    unname(coef(fit))
  },
  numeric(1),
  USE.NAMES = FALSE
)

# One segment per project, from (earliest date, linear.rate) to 2017-01-01.
# NOTE(review): the 3000 multiplier on the slope and the 1.3e7 factor that
# turns the slope into a label angle look hand-tuned -- confirm their meaning.
project.age.linear.no.intercept <- ggplot(first.points) +
  theme_classic() +
  geom_segment(aes(x = date, y=linear.rate, xend=as.Date("2017-01-01"), yend=(3000*angle.no.intercept)+linear.rate, color=project), size=0.5) +
  geom_point(aes(date, linear.rate, color=project), data=first.points, size=2) +
  geom_text(aes(date, linear.rate, label=paste0(" ", project), angle=(1.3*(10^7)*angle)),
            data=first.points, hjust=0, vjust=-0.4) +
  labs(x = "Date", y = "Linearized Atom Rate") +
  # "none" is the supported way to drop a legend; FALSE is deprecated in ggplot2
  guides(colour = "none")
project.age.linear.no.intercept
# Least-squares trend per project (thin lines) over one pooled trend for all
# rows (thick gray line), with emacs left out of both the data and the labels.
# Each project's dot sits at (earliest date, linear.rate) from first.points.
# NOTE(review): the 1.3e7 factor that turns a slope into a label angle, and the
# coordinates of the "All Projects" annotation, look hand-tuned -- confirm.
project.age.linear <- ggplot(code.age.all.atoms[project!='emacs']) +
  theme_classic() +
  stat_smooth(aes(date, rate), method="lm", colour="gray", size=2, se=FALSE, fullrange=TRUE) +
  stat_smooth(aes(date, rate, group=project), color=colors2dark[1], method="lm", size=0.5, se=FALSE) +
  annotate("text", x=as.Date('1991-02-01'), y=0.017, angle=-4.2, label=" All Projects", hjust=0.0, size=4) +
  annotate("point", x=as.Date('1991-02-01'), y=0.01605, colour='#888888', size=4.0) +
  geom_point(aes(date, linear.rate), color=colors2dark[2], data=first.points[project!='emacs'], size=3) +
  geom_text(aes(date, linear.rate, label=paste0(" ", project), angle=(1.3*(10^7)*angle)),
            data=first.points[project!='emacs'], hjust=0, vjust=-0.4, size=3.0) +
  #scale_y_continuous(limits = c(.00, .026)) +
  labs(x = "Date", y = "Linearized Atom Rate") +
  # "none" is the supported way to drop a legend; FALSE is deprecated in ggplot2
  guides(colour = "none")
project.age.linear

# Figure width in mm; the height is half of it. Assigning before the call keeps
# the height independent of the order in which ggsave() evaluates its arguments.
width <- 145
ggsave("img/project_age_linear.pdf", project.age.linear,
       width = width, height = width * 0.5, units = "mm")
# projects estimated by curves
# A default stat_smooth curve per project, plus one pooled least-squares line
# in red across all rows; dots and labels mark each project's earliest row.
ggplot(code.age.all.atoms) +
  geom_point(aes(x = date, y = rate, colour = project), data = first.data, size = 5) +
  geom_text(aes(x = date, y = rate, label = paste(" ", project)),
            data = first.data, hjust = 0, angle = 0) +
  stat_smooth(aes(x = date, y = rate, group = project, colour = project),
              size = 0.5, se = FALSE) +
  stat_smooth(aes(x = date, y = rate),
              method = "lm", colour = "red", size = 2, se = FALSE, fullrange = TRUE)
# Binomial confidence intervals for 20, 19 and 12 successes out of 20 trials.
Hmisc::binconf(x = 20, n = 20)
Hmisc::binconf(x = 19, n = 20)
Hmisc::binconf(x = 12, n = 20)