health_by_repo.py
# Copyright Dawn M. Foster <[email protected]>
# MIT License
""" Starter Project Health Metrics Model data gathered per repository
This script uses data stored in an Augur PostgreSQL database to gather
data for the metrics found in the Starter Project Health Metrics Model:
https://chaoss.community/kb/metrics-model-starter-project-health/

Partial month data will never be reported. The last data reported will
be the most recent completed month.

If there are too few PRs, some charts will not be generated because
the charts only make sense with enough data points.

If a GitHub organization and repository are both specified, data will
be gathered on that single repository only.

If only a GitHub organization is specified, it will gather data about
every repository from that organization.

Requirements
------------
Files required to run this script:
config.json
{
"connection_string": "sqlite:///:memory:",
"database": "xxxxx",
"host": "xxxx.xxxx.xx",
"password": "xxxxx",
"port": xxxx,
"schema": "augur_data",
"user": "xxxx",
"user_type": "read_only"
}
Replace the 'x's with values to connect to your Augur database
Usage
-----
usage: health_by_repo.py [-h] -o ORG_NAME [-r REPO_NAME] [-y YEARS] [-b BUS_DAYS] -c AUGUR_CONFIG
-h, --help show this help message and exit
-o ORG_NAME, --org ORG_NAME
The name of the GitHub organization for data collection on your repo(s) (required)
-r REPO_NAME, --repo REPO_NAME
The name of a GitHub repository in that org where your PRs can be found. If no repo is specified, data will be
collected for all repos from the given org.
-y YEARS, --years YEARS
The number of years of data to collect (defaults to 1)
-b BUS_DAYS, --businessdays BUS_DAYS
The number of business days to use in the time to first response calculation (defaults to 2)
-c AUGUR_CONFIG, --configfile AUGUR_CONFIG
The full file path to an Augur config.json file (required)
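
For example, a run against a single repository might look like this (the org
and repo names below are placeholders):

    python health_by_repo.py -o example_org -r example_repo -y 2 -c config.json
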
Output
------
* Messages are printed to the screen for each data gathering step for each repo
* Graphs are stored as png files in subdirectories of an "output" folder named like
output/YYYY-MM/org_name/repo_name
"""
import argparse
import sys
import pandas as pd
from utils.augur_connect import augur_db_connect
from utils.date_calcs import get_dates
from utils.repo_info import get_repo_info, fork_archive, get_org_repos
from utils.file_operations import create_path_str
from metrics.release_frequency import activity_release_graph
from metrics.closure_ratio import sustain_prs_by_repo_graph
from metrics.first_response import response_time_graph
from metrics.bus_factor import contributor_risk_graph
# Gather options from command line arguments and store them in variables
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--org", required=True, dest = "org_name", help="The name of the GitHub organization for data collection on your repo(s) (required)")
parser.add_argument("-r", "--repo", required=False, dest = "repo_name", default=None, help="The name of a GitHub repository in that org where your PRs can be found. If no repo is specified, data will be collected for all repos from the given org.")
parser.add_argument("-y", "--years", required=False, dest = "years", type=int, default=1, help="The number of years of data to collect (default to 1)")
parser.add_argument("-b", "--businessdays", required=False, dest = "bus_days", type=int, default=2, help="The number of business days to use in the time to first response calculation (default to 2)")
parser.add_argument("-c", "--configfile", required=True, dest = "augur_config", help="The full file path to an Augur config.json file (required)")
args = parser.parse_args()
org_name = args.org_name
repo_name = args.repo_name
years = args.years
bus_days = args.bus_days
augur_config = args.augur_config
# Print parameters to the screen
print('Parameters: Years =', years, 'Business Days =', bus_days)
# Get the dates for the analysis using the years argument if provided
days = 365 * years
start_date, end_date = get_dates(days)
# Create the connection to the Augur database
engine = augur_db_connect(augur_config)
if repo_name is None:
    # This is the case where data is gathered on all repos from an org
    repoDF = get_org_repos(org_name, engine)
    print("multiple repos")

    # When gathering data on an org, it can be helpful to have a summary CSV
    path = create_path_str(org_name)
    output_filename = path + '/_' + org_name + '_output_yr_' + str(years) + '_bdays_' + str(bus_days) + '.csv'

    try:
        csv_output = open(output_filename, 'w')
        csv_output.write('org_name,repo_name,releases,first_resp_mos,closure_ratio_mos,bus_factor,bus_factor_percents,fork,archive\n')
    except OSError:
        print('Could not write to csv file. Exiting')
        sys.exit(1)

else:
    # This is the case where data is gathered on a single org / repo combo
    repo_id = get_repo_info(engine, org_name, repo_name)
    repoDF = pd.DataFrame([[repo_id, repo_name]], columns=['repo_id', 'repo_name'])
# Collect data for every repo in repoDF
for repo in repoDF.iterrows():
    repo_id = repo[1]['repo_id']
    repo_name = repo[1]['repo_name']

    # Check to see if the repo is Forked or Archived, since those impact
    # how you might interpret this data, and print them to the screen.
    # In general, this model isn't intended to be used with forked
    # or archived repos.
    is_forked, is_archived = fork_archive(repo_name, org_name, engine)
    print(org_name, repo_name, '- Forked:', str(is_forked), 'Archived:', str(is_archived))

    # This section collects all of the data using the graph functions imported
    # from the metrics modules and creates the graphs for each metric.
    # Skips archived repos
    if not is_archived:
        releases = activity_release_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years)
        closure_ratio_mos = sustain_prs_by_repo_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years)
        bus_factor, bus_factor_percents = contributor_risk_graph(repo_id, repo_name, org_name, start_date, end_date, engine, years)
        first_resp_mos = response_time_graph(repo_id, repo_name, org_name, start_date, end_date, engine, bus_days, years)

        if len(repoDF) > 1:
            csv_line = org_name + ',' + repo_name + ',' + releases + ',' + first_resp_mos + ',' + closure_ratio_mos + ',' + bus_factor + ',' + bus_factor_percents + ',' + str(is_forked) + ',' + str(is_archived) + '\n'
            csv_output.write(csv_line)

    # Print a separator between repos
    print('-------------')
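
# A minimal cleanup step, assuming no further writes are needed: close the
# summary CSV that was opened when gathering data for a whole org, mirroring
# the len(repoDF) > 1 guard used for the per-repo rows above.
if len(repoDF) > 1:
    csv_output.close()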