From b1af3f5f756095dc4ec6b3dc2da7b6affad60217 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 01/52] reformat using black --- croissance/__init__.py | 10 +- croissance/__main__.py | 105 +++++++++++------- croissance/estimation/__init__.py | 111 ++++++++++++------- croissance/estimation/defaults.py | 9 +- croissance/estimation/outliers.py | 15 ++- croissance/estimation/ranking.py | 19 +++- croissance/estimation/regression.py | 35 +++--- croissance/estimation/smoothing/__init__.py | 10 +- croissance/estimation/smoothing/segments.py | 18 +-- croissance/estimation/util.py | 25 +++-- croissance/figures/__init__.py | 115 +++++++++++++------- croissance/formats/input.py | 6 +- croissance/formats/output.py | 26 +++-- setup.py | 48 ++++---- test.py | 82 ++++++++------ 15 files changed, 393 insertions(+), 241 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index f535c77..a365d3c 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -1,15 +1,13 @@ from croissance.estimation import Estimator from collections import namedtuple -AnnotatedGrowthCurve = namedtuple('AnnotatedGrowthCurve', ('series', 'outliers', 'growth_phases')) +AnnotatedGrowthCurve = namedtuple( + "AnnotatedGrowthCurve", ("series", "outliers", "growth_phases") +) -def process_curve(curve: 'pandas.Series', **kwargs): +def process_curve(curve: "pandas.Series", **kwargs): estimator = Estimator(**kwargs) if curve.isnull().all(): return AnnotatedGrowthCurve(curve, [], []) return estimator.growth(curve) - - - - diff --git a/croissance/__main__.py b/croissance/__main__.py index 1bc9b20..fb66ca2 100644 --- a/croissance/__main__.py +++ b/croissance/__main__.py @@ -12,68 +12,86 @@ from croissance.formats.output import TSVWriter parser = argparse.ArgumentParser(description="Estimate growth rates in growth curves") -parser.add_argument('infiles', type=argparse.FileType('r'), nargs='+') -parser.add_argument('--input-format', type=str, default='tsv') -parser.add_argument('--input-time-unit', - choices=['hours', 'minutes'], - default='hours', - help="Time unit in time column") -parser.add_argument('--N0', - type=float, - default=0.0, - help="Baseline for constrained regressions & identifying curve segments in log space.") -parser.add_argument('--constrain-N0', - action='store_true', - help="All growth phases will begin at N0 when this flag is set.") -parser.add_argument('--segment-log-N0', - action='store_true', - help="Identify curve segments using log(N-N0) rather than N; " - "increases sensitivity to changes in exponential growth but has to assume a certain N0.") -parser.add_argument('--output-format', type=str, default='tsv') -parser.add_argument('--output-suffix', type=str, default='.output') -parser.add_argument('--output-exclude-default-phase', - type=bool, - default=False, - help="The output will not include a phase '0' for each curve when this flag is set.") -parser.add_argument('--figures', - action='store_true', - help='Renders a PDF file with figures for each curve.') +parser.add_argument("infiles", type=argparse.FileType("r"), nargs="+") +parser.add_argument("--input-format", type=str, default="tsv") +parser.add_argument( + "--input-time-unit", + choices=["hours", "minutes"], + default="hours", + help="Time unit in time column", +) +parser.add_argument( + "--N0", + type=float, + default=0.0, + help="Baseline for constrained regressions & identifying curve segments in log space.", +) +parser.add_argument( + "--constrain-N0", + action="store_true", + help="All growth phases will begin at N0 when this flag is set.", +) +parser.add_argument( + "--segment-log-N0", + action="store_true", + help="Identify curve segments using log(N-N0) rather than N; " + "increases sensitivity to changes in exponential growth but has to assume a certain N0.", +) +parser.add_argument("--output-format", type=str, default="tsv") +parser.add_argument("--output-suffix", type=str, default=".output") +parser.add_argument( + "--output-exclude-default-phase", + type=bool, + default=False, + help="The output will not include a phase '0' for each curve when this flag is set.", +) +parser.add_argument( + "--figures", + action="store_true", + help="Renders a PDF file with figures for each curve.", +) def main(): args = parser.parse_args() - if args.input_format.lower() != 'tsv': + if args.input_format.lower() != "tsv": raise NotImplementedError() - if args.output_format.lower() != 'tsv': + if args.output_format.lower() != "tsv": raise NotImplementedError() reader = TSVReader() - estimator = Estimator(constrain_n0=args.constrain_N0, - segment_log_n0=args.segment_log_N0, - n0=args.N0) + estimator = Estimator( + constrain_n0=args.constrain_N0, segment_log_n0=args.segment_log_N0, n0=args.N0 + ) try: empties = {} - for infile in tqdm(args.infiles, unit='infile'): - outfile = open('{}{}.tsv'.format(infile.name[:-4], args.output_suffix), 'w') + for infile in tqdm(args.infiles, unit="infile"): + outfile = open("{}{}.tsv".format(infile.name[:-4], args.output_suffix), "w") if args.figures: - figfile = open('{}{}.pdf'.format(infile.name[:-4], args.output_suffix), 'wb') + figfile = open( + "{}{}.pdf".format(infile.name[:-4], args.output_suffix), "wb" + ) figwriter = PDFWriter(figfile) - outwriter = TSVWriter(outfile, include_default_phase=not args.output_exclude_default_phase) + outwriter = TSVWriter( + outfile, include_default_phase=not args.output_exclude_default_phase + ) - for name, curve in tqdm(list(reader.read(infile)), unit='curve'): + for name, curve in tqdm(list(reader.read(infile)), unit="curve"): if curve.empty: try: empties[infile.name].append(name) except KeyError: empties[infile.name] = [name] continue - annotated_curve = estimator.growth(normalize_time_unit(curve, args.input_time_unit)) + annotated_curve = estimator.growth( + normalize_time_unit(curve, args.input_time_unit) + ) outwriter.write(name, annotated_curve) @@ -88,10 +106,19 @@ def main(): pass if empties != {}: - print('\nEmpty cells were found and discarded:\n', '\n'.join([(infile.name+'\t'+name) for key, names in empties.items() for name in names])) + print( + "\nEmpty cells were found and discarded:\n", + "\n".join( + [ + (infile.name + "\t" + name) + for key, names in empties.items() + for name in names + ] + ), + ) else: print() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index d54dd53..a132d29 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -5,17 +5,28 @@ import numpy import pandas -from croissance.estimation.defaults import MINIMUM_VALID_OD, RESAMPLE_POINTS_PER_HOUR, MINIMUM_VALID_SLOPE, \ - MINIMUM_PHASE_LENGTH, PHASE_MIN_DELTA_LOG_OD, CURVE_MINIMUM_DURATION_HOURS +from croissance.estimation.defaults import ( + MINIMUM_VALID_OD, + RESAMPLE_POINTS_PER_HOUR, + MINIMUM_VALID_SLOPE, + MINIMUM_PHASE_LENGTH, + PHASE_MIN_DELTA_LOG_OD, + CURVE_MINIMUM_DURATION_HOURS, +) from croissance.estimation.outliers import remove_outliers from croissance.estimation.ranking import rank_phases from croissance.estimation.regression import fit_exponential from croissance.estimation.smoothing import median_smoothing from croissance.estimation.smoothing.segments import segment_spline_smoothing -from croissance.estimation.util import savitzky_golay, resample, with_overhangs, points_per_hour +from croissance.estimation.util import ( + savitzky_golay, + resample, + with_overhangs, + points_per_hour, +) -class RawGrowthPhase(namedtuple('RawGrowthPhase', ('start', 'end'))): +class RawGrowthPhase(namedtuple("RawGrowthPhase", ("start", "end"))): __slots__ = () @property @@ -23,14 +34,11 @@ def duration(self): return self.end - self.start -class GrowthPhase(namedtuple('GrowthPhase', ( - 'start', - 'end', - 'slope', - 'intercept', - 'n0', - 'attributes' -))): +class GrowthPhase( + namedtuple( + "GrowthPhase", ("start", "end", "slope", "intercept", "n0", "attributes") + ) +): __slots__ = () @property @@ -38,7 +46,7 @@ def duration(self): return self.end - self.start @classmethod - def pick_best(cls, growth_phases, metric='duration'): + def pick_best(cls, growth_phases, metric="duration"): try: return sorted(growth_phases, key=attrgetter(metric), reverse=True)[0] except IndexError: @@ -48,21 +56,27 @@ def __getattr__(self, item): return self.attributes.get(item, None) -AnnotatedGrowthCurve = namedtuple('AnnotatedGrowthCurve', ('series', 'outliers', 'growth_phases')) +AnnotatedGrowthCurve = namedtuple( + "AnnotatedGrowthCurve", ("series", "outliers", "growth_phases") +) class Estimator(object): - def __init__(self, - *, - segment_log_n0: bool = False, - constrain_n0: bool = False, - n0: float = 0.0): + def __init__( + self, + *, + segment_log_n0: bool = False, + constrain_n0: bool = False, + n0: float = 0.0 + ): self._logger = logging.getLogger(__name__) self._segment_log_n0 = segment_log_n0 self._constrain_n0 = constrain_n0 self._n0 = n0 - def _find_growth_phases(self, curve: 'pandas.Series', window) -> 'typing.List[RawGrowthPhase]': + def _find_growth_phases( + self, curve: "pandas.Series", window + ) -> "typing.List[RawGrowthPhase]": """ Finds growth phases by locating regions in a series where both the first and second derivatives are positive. @@ -73,8 +87,10 @@ def _find_growth_phases(self, curve: 'pandas.Series', window) -> 'typing.List[Ra first_derivative = savitzky_golay(curve, window, 3, deriv=1) second_derivative = savitzky_golay(curve, window, 3, deriv=2) - growth = pandas.Series(index=curve.index, - data=(first_derivative.values > 0) & (second_derivative.values > 0)) + growth = pandas.Series( + index=curve.index, + data=(first_derivative.values > 0) & (second_derivative.values > 0), + ) istart, iend, phases = None, None, [] for i, v in zip(growth.index, growth.values): @@ -94,7 +110,9 @@ def _find_growth_phases(self, curve: 'pandas.Series', window) -> 'typing.List[Ra def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: series = curve.dropna() - n_hours = int(numpy.round(points_per_hour(series) * CURVE_MINIMUM_DURATION_HOURS)) + n_hours = int( + numpy.round(points_per_hour(series) * CURVE_MINIMUM_DURATION_HOURS) + ) if n_hours == 0: return AnnotatedGrowthCurve(series, [], []) @@ -112,7 +130,9 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: series_log_n0 = numpy.log(series - self._n0).dropna() smooth_series = segment_spline_smoothing(series, series_log_n0) else: - smooth_series = segment_spline_smoothing(series, ) + smooth_series = segment_spline_smoothing( + series, + ) phases = [] # give up if there isn't enough data @@ -121,7 +141,7 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: raw_phases = self._find_growth_phases(smooth_series, window=n_hours) for phase in raw_phases: - phase_series = series[phase.start:phase.end] + phase_series = series[phase.start : phase.end] # skip any growth phases that have not enough points for fitting if len(phase_series[phase_series > 0]) < 3: @@ -137,9 +157,13 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: # continue if self._constrain_n0: - slope, intercept, n0, snr, fallback_linear_method = fit_exponential(phase_series, n0=self._n0) + slope, intercept, n0, snr, fallback_linear_method = fit_exponential( + phase_series, n0=self._n0 + ) else: - slope, intercept, n0, snr, fallback_linear_method = fit_exponential(phase_series) + slope, intercept, n0, snr, fallback_linear_method = fit_exponential( + phase_series + ) # skip phases whose actual slope is below the limit if slope < max(0.0, defaults.PHASE_MINIMUM_SLOPE): @@ -149,13 +173,26 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: if snr < max(1.0, defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO): continue - phases.append(GrowthPhase(phase.start, phase.end, slope, intercept, n0, {"SNR": snr})) - - ranked_phases = rank_phases(phases, defaults.PHASE_RANK_WEIGHTS, thresholds={ - 'SNR': defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO, - 'duration': defaults.PHASE_MINIMUM_DURATION_HOURS, - 'slope': defaults.PHASE_MINIMUM_SLOPE - }) - - return AnnotatedGrowthCurve(series, outliers, [phase for phase in ranked_phases - if phase.rank >= defaults.PHASE_RANK_EXCLUDE_BELOW]) + phases.append( + GrowthPhase(phase.start, phase.end, slope, intercept, n0, {"SNR": snr}) + ) + + ranked_phases = rank_phases( + phases, + defaults.PHASE_RANK_WEIGHTS, + thresholds={ + "SNR": defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO, + "duration": defaults.PHASE_MINIMUM_DURATION_HOURS, + "slope": defaults.PHASE_MINIMUM_SLOPE, + }, + ) + + return AnnotatedGrowthCurve( + series, + outliers, + [ + phase + for phase in ranked_phases + if phase.rank >= defaults.PHASE_RANK_EXCLUDE_BELOW + ], + ) diff --git a/croissance/estimation/defaults.py b/croissance/estimation/defaults.py index d3a8172..f074476 100644 --- a/croissance/estimation/defaults.py +++ b/croissance/estimation/defaults.py @@ -1,4 +1,3 @@ - CURVE_MINIMUM_DURATION_HOURS = 5 PHASE_MINIMUM_SIGNAL_NOISE_RATIO = 1.0 @@ -7,9 +6,9 @@ PHASE_RANK_EXCLUDE_BELOW = 33 PHASE_RANK_WEIGHTS = { - 'SNR': 50, - 'duration': 30, - 'slope': 10, + "SNR": 50, + "duration": 30, + "slope": 10, # TODO add 1 - start? } @@ -24,4 +23,4 @@ PHASE_MIN_DELTA_OD = 0.25 MINIMUM_PHASE_LENGTH = 2 -MINOR_PHASE_MERGING_THRESHOLD = 0.10 # in percent +MINOR_PHASE_MERGING_THRESHOLD = 0.10 # in percent diff --git a/croissance/estimation/outliers.py b/croissance/estimation/outliers.py index 74d5749..0b99c82 100644 --- a/croissance/estimation/outliers.py +++ b/croissance/estimation/outliers.py @@ -16,10 +16,17 @@ def remove_outliers(series, window=30, std=2): return series, pandas.Series(data=[], index=[]) values = with_overhangs(series.values, window) - outliers = abs(values - values.rolling(window=window, center=True).median()) < values.rolling(window=window, center=True).std() * std + outliers = ( + abs(values - values.rolling(window=window, center=True).median()) + < values.rolling(window=window, center=True).std() * std + ) outlier_mask = outliers[window:-window].values - outliers = pandas.Series(data=series.values[~outlier_mask], index=series.index[~outlier_mask]) - series = pandas.Series(data=series.values[outlier_mask], index=series.index[outlier_mask]) + outliers = pandas.Series( + data=series.values[~outlier_mask], index=series.index[~outlier_mask] + ) + series = pandas.Series( + data=series.values[outlier_mask], index=series.index[outlier_mask] + ) - return series, outliers \ No newline at end of file + return series, outliers diff --git a/croissance/estimation/ranking.py b/croissance/estimation/ranking.py index 68dc2ea..4b86dca 100644 --- a/croissance/estimation/ranking.py +++ b/croissance/estimation/ranking.py @@ -15,13 +15,24 @@ def rank_phases(phases, weights, thresholds): values[attribute] = [getattr(phase, attribute) for phase in phases] for phase in phases: - scores.append((sum(weight * score([thresholds[attribute]] + values[attribute], getattr(phase, attribute)) - for attribute, weight in weights.items()) / sum(weights.values()), - phase)) + scores.append( + ( + sum( + weight + * score( + [thresholds[attribute]] + values[attribute], + getattr(phase, attribute), + ) + for attribute, weight in weights.items() + ) + / sum(weights.values()), + phase, + ) + ) ranked_phases = [] for rank, phase in sorted(scores, key=itemgetter(0), reverse=True): - phase.attributes['rank'] = rank + phase.attributes["rank"] = rank ranked_phases.append(phase) return ranked_phases diff --git a/croissance/estimation/regression.py b/croissance/estimation/regression.py index bea1059..a9c12b3 100644 --- a/croissance/estimation/regression.py +++ b/croissance/estimation/regression.py @@ -17,6 +17,7 @@ def fit_exponential(series, *, p0=(1.0, 0.01, 0.0), n0: float = None): if n0 is None: fit_fn = exponential else: + def exponential_constrain_n0(x, a, b): return a * numpy.exp(b * x) + n0 @@ -24,12 +25,16 @@ def exponential_constrain_n0(x, a, b): p0 = p0[:2] try: - popt, pcov = curve_fit(fit_fn, - series.index, - series.values, - p0=p0, - maxfev=10000, - bounds=([0., 0., 0.], numpy.inf) if n0 is None else ([0., 0.], numpy.inf)) + popt, pcov = curve_fit( + fit_fn, + series.index, + series.values, + p0=p0, + maxfev=10000, + bounds=([0.0, 0.0, 0.0], numpy.inf) + if n0 is None + else ([0.0, 0.0], numpy.inf), + ) if n0 is not None: popt = tuple(popt) + (n0,) @@ -47,7 +52,7 @@ def exponential_constrain_n0(x, a, b): log_series = numpy.log(series[series > 0] - (n0 or 0.0)) slope, c, *__ = linregress(log_series.index, log_series.values) - intercept = - c / slope + intercept = -c / slope if n0 is None: p0 = (numpy.exp(c), slope, 0.0) @@ -55,12 +60,16 @@ def exponential_constrain_n0(x, a, b): p0 = (numpy.exp(c), slope) try: - popt, pcov = curve_fit(fit_fn, - series.index, - series.values, - p0=p0, - maxfev=10000, - bounds=([0., 0., 0.], numpy.inf) if n0 is None else ([0., 0.], numpy.inf)) + popt, pcov = curve_fit( + fit_fn, + series.index, + series.values, + p0=p0, + maxfev=10000, + bounds=([0.0, 0.0, 0.0], numpy.inf) + if n0 is None + else ([0.0, 0.0], numpy.inf), + ) if n0 is not None: popt = tuple(popt) + (n0,) diff --git a/croissance/estimation/smoothing/__init__.py b/croissance/estimation/smoothing/__init__.py index 56a05f1..6a4c1e6 100644 --- a/croissance/estimation/smoothing/__init__.py +++ b/croissance/estimation/smoothing/__init__.py @@ -4,8 +4,10 @@ def median_smoothing(series, window=10): - data = pandas.rolling_apply(with_overhangs(series.values, window // 2), - window, - median_window_clean, - center=True)[window // 2:-window // 2] + data = pandas.rolling_apply( + with_overhangs(series.values, window // 2), + window, + median_window_clean, + center=True, + )[window // 2 : -window // 2] return pandas.Series(index=series.index, data=data) diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index 77a3168..389bb2b 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -22,8 +22,10 @@ def segment_by_std_dev(series, increment=2, maximum=20): for i in range(start, duration, increment): for size in range(1, maximum + 1): - window = detrend(series[i:i + size*increment]) - heappush(windows, (window.std() / (size*increment), i, i + size*increment)) + window = detrend(series[i : i + size * increment]) + heappush( + windows, (window.std() / (size * increment), i, i + size * increment) + ) segments = [] spots = set() @@ -66,7 +68,7 @@ def segment_points(series, segments): :param segments: :return: """ - out = [(series.index[0], numpy.median(series[:series.index[0] + 1]))] + out = [(series.index[0], numpy.median(series[: series.index[0] + 1]))] for start, end in segments: window = series[start:end] @@ -75,16 +77,18 @@ def segment_points(series, segments): continue if end - start > 5: - out.append(window_median(series[start:start + 2], start, start + 2)) + out.append(window_median(series[start : start + 2], start, start + 2)) if end - start > 11: - out.append(window_median(series[start + 2:end - 2], start + 2, end - 2)) + out.append( + window_median(series[start + 2 : end - 2], start + 2, end - 2) + ) - out.append(window_median(series[end - 2:end], end - 2, end)) + out.append(window_median(series[end - 2 : end], end - 2, end)) else: out.append(window_median(window, start, end)) - out += [(series.index[-1], numpy.max(series[series.index[0] - 1:]))] + out += [(series.index[-1], numpy.max(series[series.index[0] - 1 :]))] index, data = zip(*out) return pandas.Series(index=index, data=data) diff --git a/croissance/estimation/util.py b/croissance/estimation/util.py index 43ead1d..63adae7 100644 --- a/croissance/estimation/util.py +++ b/croissance/estimation/util.py @@ -5,7 +5,9 @@ def points_per_hour(series): - return 1 / numpy.median([series.index[i] - series.index[i - 1] for i in range(1, len(series))]) + return 1 / numpy.median( + [series.index[i] - series.index[i - 1] for i in range(1, len(series))] + ) def resample(series, *, factor=10, size=None): @@ -29,20 +31,25 @@ def resample(series, *, factor=10, size=None): def savitzky_golay(series, *args, **kwargs): - return pandas.Series(index=series.index, data=savgol_filter(series.values, *args, **kwargs)) + return pandas.Series( + index=series.index, data=savgol_filter(series.values, *args, **kwargs) + ) def with_overhangs(values, overhang_size): - start_overhang = numpy.repeat([numpy.median(values[0:overhang_size // 2 + 1])], overhang_size) - end_overhang = numpy.repeat([numpy.max(values[-1 - overhang_size // 2:-1])], overhang_size) + start_overhang = numpy.repeat( + [numpy.median(values[0 : overhang_size // 2 + 1])], overhang_size + ) + end_overhang = numpy.repeat( + [numpy.max(values[-1 - overhang_size // 2 : -1])], overhang_size + ) return pandas.Series(numpy.concatenate([start_overhang, values, end_overhang])) -def normalize_time_unit(curve: pandas.Series, unit: str = 'hours'): - if unit == 'hours': +def normalize_time_unit(curve: pandas.Series, unit: str = "hours"): + if unit == "hours": return curve - elif unit == 'minutes': - return pandas.Series(index=curve.index / 60., data=curve.values) + elif unit == "minutes": + return pandas.Series(index=curve.index / 60.0, data=curve.values) else: raise NotImplementedError("Unsupported time unit: '{}'".format(unit)) - diff --git a/croissance/figures/__init__.py b/croissance/figures/__init__.py index cb42fd1..d3dedc0 100644 --- a/croissance/figures/__init__.py +++ b/croissance/figures/__init__.py @@ -10,12 +10,10 @@ def __init__(self, file, include_shifted_exponentials: bool = False): self.doc = PdfPages(file) self._include_shifted_exponentials = include_shifted_exponentials - def write(self, - name: str, - curve: AnnotatedGrowthCurve): + def write(self, name: str, curve: AnnotatedGrowthCurve): fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(16, 16)) - axes[1].set_yscale('log') + axes[1].set_yscale("log") try: plt.title(name) @@ -24,21 +22,25 @@ def write(self, return for axis in axes: - axis.plot(curve.series.index, - curve.series.values, - color='black', - marker='.', - markersize=5, - linestyle='None') - - axis.plot(curve.outliers.index, - curve.outliers.values, - color='red', - marker='.', - markersize=5, - linestyle='None') - - colors = ['b', 'g', 'c', 'm', 'y', 'k'] + axis.plot( + curve.series.index, + curve.series.values, + color="black", + marker=".", + markersize=5, + linestyle="None", + ) + + axis.plot( + curve.outliers.index, + curve.outliers.values, + color="red", + marker=".", + markersize=5, + linestyle="None", + ) + + colors = ["b", "g", "c", "m", "y", "k"] for i, phase in enumerate(curve.growth_phases): color = colors[i % len(colors)] @@ -51,30 +53,63 @@ def write(self, def gf(x): return a * numpy.exp(phase.slope * x) + phase.n0 - phase_series = curve.series[phase.start:phase.end] + phase_series = curve.series[phase.start : phase.end] for axis in axes: if self._include_shifted_exponentials: - axis.plot(phase_series.index, - phase_series.values - phase.n0, - marker='.', linewidth=0, color='red') - - axis.plot(phase_series.index, gf(phase_series.index) - phase.n0, - marker=None, linewidth=2, color='red') - - axis.axhline(y=phase.n0, marker=None, linewidth=1, linestyle='dashed', color=color) - axis.axvline(x=phase.intercept, marker=None, linewidth=1, linestyle='dashed', color=color) - - axis.plot(phase_series.index, - phase_series.values, - marker=None, - linewidth=15, - color=color, - solid_capstyle='round', - alpha=0.2) - - axis.plot(curve.series.index, gf(curve.series.index), color=color, linewidth=1) - axis.plot(phase_series.index, gf(phase_series.index), color=color, linewidth=2) + axis.plot( + phase_series.index, + phase_series.values - phase.n0, + marker=".", + linewidth=0, + color="red", + ) + + axis.plot( + phase_series.index, + gf(phase_series.index) - phase.n0, + marker=None, + linewidth=2, + color="red", + ) + + axis.axhline( + y=phase.n0, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + axis.axvline( + x=phase.intercept, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + + axis.plot( + phase_series.index, + phase_series.values, + marker=None, + linewidth=15, + color=color, + solid_capstyle="round", + alpha=0.2, + ) + + axis.plot( + curve.series.index, + gf(curve.series.index), + color=color, + linewidth=1, + ) + axis.plot( + phase_series.index, + gf(phase_series.index), + color=color, + linewidth=2, + ) for axis in axes: axis.set_xlim(curve.series.index[0], curve.series.index[-1]) diff --git a/croissance/formats/input.py b/croissance/formats/input.py index 5d964eb..81c2073 100644 --- a/croissance/formats/input.py +++ b/croissance/formats/input.py @@ -2,12 +2,8 @@ class TSVReader(object): - def read(self, file): - data = pandas.read_csv(file, - sep="\t", - header=0, - index_col=0) + data = pandas.read_csv(file, sep="\t", header=0, index_col=0) names = data.columns diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 903ac05..11d67d8 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -5,27 +5,37 @@ class TSVWriter(object): def __init__(self, file, include_default_phase: bool = True): - writer = csv.writer(file, delimiter='\t', quoting=csv.QUOTE_MINIMAL) - writer.writerow(['name', 'phase', 'slope', 'intercept', 'N0', 'SNR', 'rank']) + writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) + writer.writerow(["name", "phase", "slope", "intercept", "N0", "SNR", "rank"]) self._writer = writer self._file = file self._include_default_phase = include_default_phase - def write(self, - name: str, - curve: AnnotatedGrowthCurve): + def write(self, name: str, curve: AnnotatedGrowthCurve): if self._include_default_phase: - phase = GrowthPhase.pick_best(curve.growth_phases, 'rank') + phase = GrowthPhase.pick_best(curve.growth_phases, "rank") if not phase: self._writer.writerow([name, 0, None, None, None]) else: - self._writer.writerow([name, 0, phase.slope, phase.intercept, phase.n0, phase.SNR, phase.rank]) + self._writer.writerow( + [ + name, + 0, + phase.slope, + phase.intercept, + phase.n0, + phase.SNR, + phase.rank, + ] + ) for i, phase in enumerate(curve.growth_phases, start=1): - self._writer.writerow([name, i, phase.slope, phase.intercept, phase.n0, phase.SNR, phase.rank]) + self._writer.writerow( + [name, i, phase.slope, phase.intercept, phase.n0, phase.SNR, phase.rank] + ) def __enter__(self): return self diff --git a/setup.py b/setup.py index 7a7986f..b559cfb 100644 --- a/setup.py +++ b/setup.py @@ -19,37 +19,37 @@ from setuptools import setup, find_packages setup( - name='croissance', - version='1.1.1', - packages=find_packages(exclude=['*tests*']), - url='https://github.com/biosustain/croissance', - author='Lars Schöning', - author_email='lays@biosustain.dtu.dk', - description='A tool for estimating growth rates in growth curves.', - long_description=codecs.open('README.rst', encoding='utf-8').read(), - license='Apache License Version 2.0', + name="croissance", + version="1.1.1", + packages=find_packages(exclude=["*tests*"]), + url="https://github.com/biosustain/croissance", + author="Lars Schöning", + author_email="lays@biosustain.dtu.dk", + description="A tool for estimating growth rates in growth curves.", + long_description=codecs.open("README.rst", encoding="utf-8").read(), + license="Apache License Version 2.0", entry_points={ - 'console_scripts': [ - 'croissance = croissance.__main__:main', + "console_scripts": [ + "croissance = croissance.__main__:main", ], }, - test_suite='nose.collector', + test_suite="nose.collector", tests_require=[ - 'nose>=1.1.2', + "nose>=1.1.2", ], install_requires=[ - 'numpy>=1.9.1', - 'pandas>=0.18.0', - 'scipy>=0.14.0', - 'matplotlib>=1.4.3', - 'tqdm>=4.11.2' + "numpy>=1.9.1", + "pandas>=0.18.0", + "scipy>=0.14.0", + "matplotlib>=1.4.3", + "tqdm>=4.11.2", ], classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Topic :: Utilities', - 'Intended Audience :: Science/Research', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'License :: OSI Approved :: Apache Software License', + "Development Status :: 5 - Production/Stable", + "Topic :: Utilities", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "License :: OSI Approved :: Apache Software License", ], ) diff --git a/test.py b/test.py index a09a3e8..5982ebb 100644 --- a/test.py +++ b/test.py @@ -14,39 +14,41 @@ def test_regression_basic(self): for mu in (0.001, 0.10, 0.15, 0.50, 1.0): phase = pandas.Series( data=[numpy.exp(i * mu) for i in range(20)], - index=[i for i in range(20)]) + index=[i for i in range(20)], + ) slope, intercept, n0, snr, lin = fit_exponential(phase) print(slope, intercept, n0, snr, lin) self.assertFalse(lin, "This fit should not require a linear fallback.") - self.assertAlmostEqual(n0, 0, 2, msg='N0=0') + self.assertAlmostEqual(n0, 0, 2, msg="N0=0") self.assertAlmostEqual(intercept, 0, 2, msg='"intercept"=0') - self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) + self.assertAlmostEqual(mu, slope, 6, msg="growth rate (mu)={}".format(mu)) self.assertTrue(snr > 100000, "signal-noise ratio should be very good") def test_process_curve_basic(self): - with open('./test.basic.pdf', 'wb') as f: + with open("./test.basic.pdf", "wb") as f: with PDFWriter(f) as doc: - mu = .5 - pph = 4. + mu = 0.5 + pph = 4.0 curve = pandas.Series( data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)]) + index=[i / pph for i in range(100)], + ) - result = process_curve(curve, constrain_n0=True, n0=0.) + result = process_curve(curve, constrain_n0=True, n0=0.0) # self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) - doc.write('#0 n0=1', result) + doc.write("#0 n0=1", result) print(result.growth_phases) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0., result.growth_phases[0].n0, 3) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) self.assertTrue(result.growth_phases[0].SNR > 1000) result = process_curve(curve, constrain_n0=False) - doc.write('#0 n0=auto', result) + doc.write("#0 n0=auto", result) print(result.growth_phases) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) @@ -55,62 +57,70 @@ def test_process_curve_basic(self): curve = pandas.Series( data=( - [1.] * 5 + - [numpy.exp(mu * i / pph) for i in range(25)] + - [numpy.exp(mu * 24 / pph)] * 20), - index=([i / pph for i in range(50)])) - - result = process_curve(curve, constrain_n0=True, n0=0.) - doc.write('#1 n0=0', result) + [1.0] * 5 + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 20 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#1 n0=0", result) print(result.growth_phases) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0., result.growth_phases[0].n0, 3) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) self.assertTrue(result.growth_phases[0].SNR > 1000) result = process_curve(curve, constrain_n0=False) - doc.write('#1 n0=auto', result) + doc.write("#1 n0=auto", result) print(result.growth_phases) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 1) self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) self.assertTrue(result.growth_phases[0].SNR > 1000) - - mu = .20 + mu = 0.20 curve = pandas.Series( - data=([i / 10. for i in range(10)] + - [numpy.exp(mu * i / pph) for i in range(25)] + - [numpy.exp(mu * 24 / pph)] * 15), - index=([i / pph for i in range(50)])) - - result = process_curve(curve, constrain_n0=True, n0=0.) - doc.write('#2 n0=0', result) + data=( + [i / 10.0 for i in range(10)] + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 15 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#2 n0=0", result) print(result.growth_phases) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0., result.growth_phases[0].n0, 6) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 6) self.assertTrue(result.growth_phases[0].SNR > 1000) result = process_curve(curve, constrain_n0=False) - doc.write('#2 n0=auto', result) + doc.write("#2 n0=auto", result) self.assertEqual(len(result.growth_phases), 1) self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) self.assertTrue(result.growth_phases[0].SNR > 1000) def test_process_curve_wrong_time_unit(self): - mu = .5 + mu = 0.5 pph = 0.1 curve = pandas.Series( data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)]) + index=[i / pph for i in range(100)], + ) - result = process_curve(curve, constrain_n0=False, n0=0.) + result = process_curve(curve, constrain_n0=False, n0=0.0) self.assertEqual(len(result.growth_phases), 0) self.assertEqual(list(result.series), list(curve)) def test_normalize_time_unit(self): curve = pandas.Series(index=[0, 15, 30, 60], data=[1, 2, 3, 4]) - self.assertTrue(pandas.Series(index=[0., 0.25, 0.5, 1.], data=[1, 2, 3, 4]) - .equals(normalize_time_unit(curve, 'minutes'))) + self.assertTrue( + pandas.Series(index=[0.0, 0.25, 0.5, 1.0], data=[1, 2, 3, 4]).equals( + normalize_time_unit(curve, "minutes") + ) + ) From 124d090ab4437ee89ed185282f25f653a6f91f51 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 02/52] lints and minor cleanups --- croissance/__main__.py | 2 +- croissance/estimation/__init__.py | 18 ++++++------------ croissance/estimation/ranking.py | 2 +- croissance/estimation/regression.py | 4 ++-- croissance/estimation/smoothing/segments.py | 4 ++-- croissance/figures/__init__.py | 2 +- croissance/formats/input.py | 2 +- croissance/formats/output.py | 2 +- 8 files changed, 15 insertions(+), 21 deletions(-) diff --git a/croissance/__main__.py b/croissance/__main__.py index fb66ca2..83d86cb 100644 --- a/croissance/__main__.py +++ b/croissance/__main__.py @@ -110,7 +110,7 @@ def main(): "\nEmpty cells were found and discarded:\n", "\n".join( [ - (infile.name + "\t" + name) + (key + "\t" + name) for key, names in empties.items() for name in names ] diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index a132d29..c0fce5c 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -5,14 +5,8 @@ import numpy import pandas -from croissance.estimation.defaults import ( - MINIMUM_VALID_OD, - RESAMPLE_POINTS_PER_HOUR, - MINIMUM_VALID_SLOPE, - MINIMUM_PHASE_LENGTH, - PHASE_MIN_DELTA_LOG_OD, - CURVE_MINIMUM_DURATION_HOURS, -) +import croissance.estimation.defaults as defaults + from croissance.estimation.outliers import remove_outliers from croissance.estimation.ranking import rank_phases from croissance.estimation.regression import fit_exponential @@ -61,7 +55,7 @@ def __getattr__(self, item): ) -class Estimator(object): +class Estimator: def __init__( self, *, @@ -111,7 +105,7 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: series = curve.dropna() n_hours = int( - numpy.round(points_per_hour(series) * CURVE_MINIMUM_DURATION_HOURS) + numpy.round(points_per_hour(series) * defaults.CURVE_MINIMUM_DURATION_HOURS) ) if n_hours == 0: @@ -157,11 +151,11 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: # continue if self._constrain_n0: - slope, intercept, n0, snr, fallback_linear_method = fit_exponential( + slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( phase_series, n0=self._n0 ) else: - slope, intercept, n0, snr, fallback_linear_method = fit_exponential( + slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( phase_series ) diff --git a/croissance/estimation/ranking.py b/croissance/estimation/ranking.py index 4b86dca..5b9def5 100644 --- a/croissance/estimation/ranking.py +++ b/croissance/estimation/ranking.py @@ -11,7 +11,7 @@ def rank_phases(phases, weights, thresholds): values = {} scores = [] - for attribute, weight in weights.items(): + for attribute in weights: values[attribute] = [getattr(phase, attribute) for phase in phases] for phase in phases: diff --git a/croissance/estimation/regression.py b/croissance/estimation/regression.py index a9c12b3..833b15f 100644 --- a/croissance/estimation/regression.py +++ b/croissance/estimation/regression.py @@ -25,7 +25,7 @@ def exponential_constrain_n0(x, a, b): p0 = p0[:2] try: - popt, pcov = curve_fit( + popt, _pcov = curve_fit( fit_fn, series.index, series.values, @@ -60,7 +60,7 @@ def exponential_constrain_n0(x, a, b): p0 = (numpy.exp(c), slope) try: - popt, pcov = curve_fit( + popt, _pcov = curve_fit( fit_fn, series.index, series.values, diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index 389bb2b..da31560 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -32,7 +32,7 @@ def segment_by_std_dev(series, increment=2, maximum=20): try: while True: - window_agv_std, start, end = heappop(windows) + _window_agv_std, start, end = heappop(windows) if any(i in spots for i in range(start, int(end))): continue @@ -51,7 +51,7 @@ def segment_by_std_dev(series, increment=2, maximum=20): def window_median(window, start, end): x = numpy.linspace(0, 1, num=len(window)) A = numpy.vstack([x, numpy.ones(len(x))]).T - m, c = numpy.linalg.lstsq(A, window, rcond=None)[0] + m, _c = numpy.linalg.lstsq(A, window, rcond=None)[0] return (start + end) / 2, m * 0.5 + numpy.median(window - m * x) diff --git a/croissance/figures/__init__.py b/croissance/figures/__init__.py index d3dedc0..6f204c6 100644 --- a/croissance/figures/__init__.py +++ b/croissance/figures/__init__.py @@ -5,7 +5,7 @@ from croissance.estimation import AnnotatedGrowthCurve -class PDFWriter(object): +class PDFWriter: def __init__(self, file, include_shifted_exponentials: bool = False): self.doc = PdfPages(file) self._include_shifted_exponentials = include_shifted_exponentials diff --git a/croissance/formats/input.py b/croissance/formats/input.py index 81c2073..b631bdc 100644 --- a/croissance/formats/input.py +++ b/croissance/formats/input.py @@ -1,7 +1,7 @@ import pandas -class TSVReader(object): +class TSVReader: def read(self, file): data = pandas.read_csv(file, sep="\t", header=0, index_col=0) diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 11d67d8..415beee 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -3,7 +3,7 @@ from croissance.estimation import GrowthPhase, AnnotatedGrowthCurve -class TSVWriter(object): +class TSVWriter: def __init__(self, file, include_default_phase: bool = True): writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) writer.writerow(["name", "phase", "slope", "intercept", "N0", "SNR", "rank"]) From c897dbcfba6146a9aec3052a06276d5af92a24c2 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 03/52] cleanup main/argument parsing --- croissance/__main__.py | 124 ----------------------------------------- croissance/main.py | 119 +++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 3 files changed, 120 insertions(+), 125 deletions(-) delete mode 100644 croissance/__main__.py create mode 100644 croissance/main.py diff --git a/croissance/__main__.py b/croissance/__main__.py deleted file mode 100644 index 83d86cb..0000000 --- a/croissance/__main__.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -import argparse - -import sys - -from tqdm import tqdm - -from croissance import Estimator -from croissance.estimation.util import normalize_time_unit -from croissance.figures import PDFWriter -from croissance.formats.input import TSVReader -from croissance.formats.output import TSVWriter - -parser = argparse.ArgumentParser(description="Estimate growth rates in growth curves") -parser.add_argument("infiles", type=argparse.FileType("r"), nargs="+") -parser.add_argument("--input-format", type=str, default="tsv") -parser.add_argument( - "--input-time-unit", - choices=["hours", "minutes"], - default="hours", - help="Time unit in time column", -) -parser.add_argument( - "--N0", - type=float, - default=0.0, - help="Baseline for constrained regressions & identifying curve segments in log space.", -) -parser.add_argument( - "--constrain-N0", - action="store_true", - help="All growth phases will begin at N0 when this flag is set.", -) -parser.add_argument( - "--segment-log-N0", - action="store_true", - help="Identify curve segments using log(N-N0) rather than N; " - "increases sensitivity to changes in exponential growth but has to assume a certain N0.", -) -parser.add_argument("--output-format", type=str, default="tsv") -parser.add_argument("--output-suffix", type=str, default=".output") -parser.add_argument( - "--output-exclude-default-phase", - type=bool, - default=False, - help="The output will not include a phase '0' for each curve when this flag is set.", -) -parser.add_argument( - "--figures", - action="store_true", - help="Renders a PDF file with figures for each curve.", -) - - -def main(): - args = parser.parse_args() - - if args.input_format.lower() != "tsv": - raise NotImplementedError() - - if args.output_format.lower() != "tsv": - raise NotImplementedError() - - reader = TSVReader() - - estimator = Estimator( - constrain_n0=args.constrain_N0, segment_log_n0=args.segment_log_N0, n0=args.N0 - ) - - try: - empties = {} - for infile in tqdm(args.infiles, unit="infile"): - outfile = open("{}{}.tsv".format(infile.name[:-4], args.output_suffix), "w") - - if args.figures: - figfile = open( - "{}{}.pdf".format(infile.name[:-4], args.output_suffix), "wb" - ) - figwriter = PDFWriter(figfile) - - outwriter = TSVWriter( - outfile, include_default_phase=not args.output_exclude_default_phase - ) - - for name, curve in tqdm(list(reader.read(infile)), unit="curve"): - if curve.empty: - try: - empties[infile.name].append(name) - except KeyError: - empties[infile.name] = [name] - continue - annotated_curve = estimator.growth( - normalize_time_unit(curve, args.input_time_unit) - ) - - outwriter.write(name, annotated_curve) - - if args.figures: - figwriter.write(name, annotated_curve) - - outwriter.close() - - if args.figures: - figwriter.close() - except KeyboardInterrupt: - pass - - if empties != {}: - print( - "\nEmpty cells were found and discarded:\n", - "\n".join( - [ - (key + "\t" + name) - for key, names in empties.items() - for name in names - ] - ), - ) - else: - print() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/croissance/main.py b/croissance/main.py new file mode 100644 index 0000000..2df4389 --- /dev/null +++ b/croissance/main.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +import argparse +import collections +import sys + +from pathlib import Path + +from tqdm import tqdm + +from croissance import Estimator +from croissance.estimation.util import normalize_time_unit +from croissance.figures import PDFWriter +from croissance.formats.input import TSVReader +from croissance.formats.output import TSVWriter + + +def parse_args(argv): + parser = argparse.ArgumentParser( + description="Estimate growth rates in growth curves", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument("infiles", type=Path, nargs="+") + parser.add_argument("--input-format", type=str.lower, choices=("tsv",)) + parser.add_argument( + "--input-time-unit", + default="hours", + choices=("hours", "minutes"), + help="Time unit in time column", + ) + parser.add_argument( + "--N0", + type=float, + default=0.0, + help="Baseline for constrained regressions and identifying curve segments in " + "log space", + ) + parser.add_argument( + "--constrain-N0", + action="store_true", + help="All growth phases will begin at N0 when this flag is set", + ) + parser.add_argument( + "--segment-log-N0", + action="store_true", + help="Identify curve segments using log(N-N0) rather than N; increases " + "sensitivity to changes in exponential growth but has to assume a certain N0", + ) + parser.add_argument("--output-format", type=str.lower, choices=("tsv",)) + parser.add_argument("--output-suffix", type=str, default=".output") + parser.add_argument( + "--output-exclude-default-phase", + type=bool, + default=False, + help="Do not output phase '0' for each curve", + ) + parser.add_argument( + "--figures", + action="store_true", + help="Renders a PDF file with figures for each curve", + ) + + return parser.parse_args(argv) + + +def main(argv): + args = parse_args(argv) + reader = TSVReader() + + estimator = Estimator( + constrain_n0=args.constrain_N0, segment_log_n0=args.segment_log_N0, n0=args.N0 + ) + + empty_cells = collections.defaultdict(list) + for filepath in tqdm(args.infiles, unit="infile"): + with filepath.open("rt") as infile: + curves = list(reader.read(infile)) + + annotated_curves = {} + for name, curve in tqdm(curves, unit="curve"): + if curve.empty: + empty_cells[infile.name].append(name) + continue + + annotated_curves[name] = estimator.growth( + normalize_time_unit(curve, args.input_time_unit) + ) + + output_filepath = filepath.with_suffix(args.output_suffix + ".tsv") + with output_filepath.open("wt") as outfile: + outwriter = TSVWriter( + outfile, include_default_phase=not args.output_exclude_default_phase + ) + + for name, annotated_curve in annotated_curves.items(): + outwriter.write(name, annotated_curve) + + if args.figures: + figure_filepath = filepath.with_suffix(args.output_suffix + ".pdf") + with figure_filepath.open("wb") as figfile: + with PDFWriter(figfile) as figwriter: + for name, annotated_curve in annotated_curves.items(): + figwriter.write(name, annotated_curve) + + print() + if empty_cells: + print("Empty cells were found and discarded:\n") + + for key, names in empty_cells.items(): + for name in names: + print(key + "\t" + name) + + +def entry_point(): + sys.exit(main(sys.argv[1:])) + + +if __name__ == "__main__": + entry_point() diff --git a/setup.py b/setup.py index b559cfb..6f1ee40 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ license="Apache License Version 2.0", entry_points={ "console_scripts": [ - "croissance = croissance.__main__:main", + "croissance = croissance.main:entry_point", ], }, test_suite="nose.collector", From 243024ee1f8d72ac4d528531fd7bad842bb2a7b3 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 04/52] breaking: drop argument from --- croissance/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/croissance/main.py b/croissance/main.py index 2df4389..a884025 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -50,8 +50,7 @@ def parse_args(argv): parser.add_argument("--output-suffix", type=str, default=".output") parser.add_argument( "--output-exclude-default-phase", - type=bool, - default=False, + action="store_true", help="Do not output phase '0' for each curve", ) parser.add_argument( From 82ccb726af7f6917862de8f35d987f38d59ed9d8 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 05/52] add `unit` parameter to `process_curve` This makes it explicit that the function expects hours as the time unit by default and allows easy conversion. Also replaced pass-through kwargs with named arguments for ease of use. This fixes #6 --- croissance/__init__.py | 22 ++++++++++++++++++---- test.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index a365d3c..db9ad1a 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -1,13 +1,27 @@ -from croissance.estimation import Estimator from collections import namedtuple +from croissance.estimation import Estimator +from croissance.estimation.util import normalize_time_unit + + AnnotatedGrowthCurve = namedtuple( "AnnotatedGrowthCurve", ("series", "outliers", "growth_phases") ) -def process_curve(curve: "pandas.Series", **kwargs): - estimator = Estimator(**kwargs) +def process_curve( + curve: "pandas.Series", + segment_log_n0: bool = False, + constrain_n0: bool = False, + n0: float = 0.0, + unit: str = "hours", +): + curve = normalize_time_unit(curve, unit) if curve.isnull().all(): return AnnotatedGrowthCurve(curve, [], []) - return estimator.growth(curve) + + return Estimator( + segment_log_n0=segment_log_n0, + constrain_n0=constrain_n0, + n0=n0, + ).growth(curve) diff --git a/test.py b/test.py index 5982ebb..1f5edc2 100644 --- a/test.py +++ b/test.py @@ -3,7 +3,7 @@ import numpy import pandas -from croissance import process_curve +from croissance import process_curve, AnnotatedGrowthCurve from croissance.estimation import fit_exponential from croissance.estimation.util import normalize_time_unit from croissance.figures import PDFWriter @@ -26,6 +26,39 @@ def test_regression_basic(self): self.assertAlmostEqual(mu, slope, 6, msg="growth rate (mu)={}".format(mu)) self.assertTrue(snr > 100000, "signal-noise ratio should be very good") + def test_process_curve_empty_series(self): + result = process_curve(pandas.Series([], dtype="float64")) + + self.assertTrue(result.series.empty) + self.assertEqual(result.outliers, []) + self.assertEqual(result.growth_phases, []) + + def test_process_curve_null_series(self): + curve = pandas.Series(index=[1, 2], data=[None, float("NaN")]) + result = process_curve(curve) + + self.assertIs(result.series, curve) + self.assertEqual(result.outliers, []) + self.assertEqual(result.growth_phases, []) + + def test_process_curve_time_unit(self): + mu = 0.5 + pph = 4.0 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + hours = process_curve(curve, unit="hours") + minutes = process_curve( + pandas.Series(index=curve.index * 60.0, data=curve.values), + unit="minutes", + ) + + self.assertTrue(hours.series.equals(minutes.series)) + self.assertTrue(hours.outliers.equals(minutes.outliers)) + self.assertEqual(hours.growth_phases, minutes.growth_phases) + def test_process_curve_basic(self): with open("./test.basic.pdf", "wb") as f: with PDFWriter(f) as doc: From 74754e3f108e1d6e624ea0972ce9781f4f1ca96b Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 06/52] initial work on adding proper logging --- croissance/estimation/__init__.py | 13 ++++----- croissance/main.py | 46 +++++++++++++++++++++---------- setup.py | 4 +-- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index c0fce5c..f6f174d 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -63,7 +63,7 @@ def __init__( constrain_n0: bool = False, n0: float = 0.0 ): - self._logger = logging.getLogger(__name__) + self._log = logging.getLogger(__name__) self._segment_log_n0 = segment_log_n0 self._constrain_n0 = constrain_n0 self._n0 = n0 @@ -72,11 +72,8 @@ def _find_growth_phases( self, curve: "pandas.Series", window ) -> "typing.List[RawGrowthPhase]": """ - Finds growth phases by locating regions in a series where both the first and second derivatives are positive. - - :param curve: - :param window: - :return: + Finds growth phases by locating regions in a series where both the first and + second derivatives are positive. """ first_derivative = savitzky_golay(curve, window, 3, deriv=1) second_derivative = savitzky_golay(curve, window, 3, deriv=2) @@ -109,6 +106,7 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: ) if n_hours == 0: + self._log.warning("Less than 1 data-point/hour; cannot proceed") return AnnotatedGrowthCurve(series, [], []) if n_hours % 2 == 0: @@ -118,6 +116,7 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: # NOTE workaround for issue with negative curves if len(series[series > 0]) < 3: + self._log.warning("Less than 3 data-points post filtering; cannot proceeed") return AnnotatedGrowthCurve(series, outliers, []) if self._segment_log_n0: @@ -129,8 +128,8 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: ) phases = [] - # give up if there isn't enough data if len(smooth_series) < n_hours: + self._log.warning("Insufficient data, cannot determine growth") return AnnotatedGrowthCurve(series, [], []) raw_phases = self._find_growth_phases(smooth_series, window=n_hours) diff --git a/croissance/main.py b/croissance/main.py index a884025..8e30583 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -1,11 +1,11 @@ #!/usr/bin/env python import argparse -import collections +import logging import sys from pathlib import Path -from tqdm import tqdm +import coloredlogs from croissance import Estimator from croissance.estimation.util import normalize_time_unit @@ -59,28 +59,50 @@ def parse_args(argv): help="Renders a PDF file with figures for each curve", ) + group = parser.add_argument_group("Logging") + group.add_argument( + "--log-level", + type=str.upper, + default="INFO", + choices=("DEBUG", "INFO", "WARNING", "ERROR"), + help="Set verbosity of log messages", + ) + return parser.parse_args(argv) +def setup_logging(level): + coloredlogs.install( + fmt="%(asctime)s %(name)s %(levelname)s %(message)s", + level=level, + ) + + return logging.getLogger("croissance") + + def main(argv): args = parse_args(argv) - reader = TSVReader() + log = setup_logging(level=args.log_level) + reader = TSVReader() estimator = Estimator( - constrain_n0=args.constrain_N0, segment_log_n0=args.segment_log_N0, n0=args.N0 + constrain_n0=args.constrain_N0, + segment_log_n0=args.segment_log_N0, + n0=args.N0, ) - empty_cells = collections.defaultdict(list) - for filepath in tqdm(args.infiles, unit="infile"): + for filepath in args.infiles: + log.info("Reading curves from '%s", filepath) with filepath.open("rt") as infile: curves = list(reader.read(infile)) annotated_curves = {} - for name, curve in tqdm(curves, unit="curve"): + for name, curve in curves: if curve.empty: - empty_cells[infile.name].append(name) + log.warning("Skipping empty curve %r", name) continue + log.info("Estimating growth for curve %r", name) annotated_curves[name] = estimator.growth( normalize_time_unit(curve, args.input_time_unit) ) @@ -101,13 +123,7 @@ def main(argv): for name, annotated_curve in annotated_curves.items(): figwriter.write(name, annotated_curve) - print() - if empty_cells: - print("Empty cells were found and discarded:\n") - - for key, names in empty_cells.items(): - for name in names: - print(key + "\t" + name) + log.info("Done ..") def entry_point(): diff --git a/setup.py b/setup.py index 6f1ee40..f87ed62 100644 --- a/setup.py +++ b/setup.py @@ -38,11 +38,11 @@ "nose>=1.1.2", ], install_requires=[ + "coloredlogs>=14.0.0", + "matplotlib>=1.4.3", "numpy>=1.9.1", "pandas>=0.18.0", "scipy>=0.14.0", - "matplotlib>=1.4.3", - "tqdm>=4.11.2", ], classifiers=[ "Development Status :: 5 - Production/Stable", From a5996519865f99359ebcd4c44f156576792f63a4 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 07/52] rework I/O handling File handling logic is moved to the individual classes, to make it easier to add support for other file types with different requirements. --- croissance/figures/__init__.py | 10 ++- croissance/formats/input.py | 18 ++-- croissance/formats/output.py | 20 +++-- croissance/main.py | 21 ++--- test.py | 154 ++++++++++++++++----------------- 5 files changed, 115 insertions(+), 108 deletions(-) diff --git a/croissance/figures/__init__.py b/croissance/figures/__init__.py index 6f204c6..a64f0b8 100644 --- a/croissance/figures/__init__.py +++ b/croissance/figures/__init__.py @@ -6,8 +6,9 @@ class PDFWriter: - def __init__(self, file, include_shifted_exponentials: bool = False): - self.doc = PdfPages(file) + def __init__(self, filepath, include_shifted_exponentials: bool = False): + self._handle = filepath.open("wb") + self._doc = PdfPages(self._handle) self._include_shifted_exponentials = include_shifted_exponentials def write(self, name: str, curve: AnnotatedGrowthCurve): @@ -117,7 +118,7 @@ def gf(x): axes[0].set_ylim([max(curve.series.min(), -2.5), curve.series.max()]) axes[1].set_ylim([0.1, curve.series.max()]) finally: - self.doc.savefig(fig) + self._doc.savefig(fig) plt.close() def __enter__(self): @@ -127,4 +128,5 @@ def __exit__(self, *args, **kwargs): self.close() def close(self): - self.doc.close() + self._doc.close() + self._handle.close() diff --git a/croissance/formats/input.py b/croissance/formats/input.py index b631bdc..25b5df8 100644 --- a/croissance/formats/input.py +++ b/croissance/formats/input.py @@ -2,11 +2,17 @@ class TSVReader: - def read(self, file): - data = pandas.read_csv(file, sep="\t", header=0, index_col=0) + def __init__(self, filepath): + self._filepath = filepath - names = data.columns + def read(self): + with self._filepath.open("rt") as handle: + data = pandas.read_csv(handle, sep="\t", header=0, index_col=0) - for name in names: - curve = data[name].dropna() - yield name, curve + return [(name, data[name].dropna()) for name in data.columns] + + def __enter__(self): + return self + + def __exit__(self, *args, **kwargs): + pass diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 415beee..1aa7ae5 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -4,17 +4,19 @@ class TSVWriter: - def __init__(self, file, include_default_phase: bool = True): - writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) - writer.writerow(["name", "phase", "slope", "intercept", "N0", "SNR", "rank"]) + def __init__(self, filepath, exclude_default_phase: bool = True): + self._exclude_default_phase = exclude_default_phase + self._handle = filepath.open("wt") + self._writer = csv.writer( + self._handle, delimiter="\t", quoting=csv.QUOTE_MINIMAL + ) - self._writer = writer - self._file = file - self._include_default_phase = include_default_phase + self._writer.writerow( + ["name", "phase", "slope", "intercept", "N0", "SNR", "rank"] + ) def write(self, name: str, curve: AnnotatedGrowthCurve): - - if self._include_default_phase: + if not self._exclude_default_phase: phase = GrowthPhase.pick_best(curve.growth_phases, "rank") if not phase: @@ -44,4 +46,4 @@ def __exit__(self, *args, **kwargs): self.close() def close(self): - self._file.close() + self._handle.close() diff --git a/croissance/main.py b/croissance/main.py index 8e30583..9ae9609 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import argparse +import collections import logging import sys @@ -84,17 +85,17 @@ def main(argv): args = parse_args(argv) log = setup_logging(level=args.log_level) - reader = TSVReader() estimator = Estimator( constrain_n0=args.constrain_N0, segment_log_n0=args.segment_log_N0, n0=args.N0, ) + log.info("Estimating growth curves ..") for filepath in args.infiles: log.info("Reading curves from '%s", filepath) - with filepath.open("rt") as infile: - curves = list(reader.read(infile)) + with TSVReader(filepath) as reader: + curves = reader.read() annotated_curves = {} for name, curve in curves: @@ -108,20 +109,16 @@ def main(argv): ) output_filepath = filepath.with_suffix(args.output_suffix + ".tsv") - with output_filepath.open("wt") as outfile: - outwriter = TSVWriter( - outfile, include_default_phase=not args.output_exclude_default_phase - ) - + with TSVWriter(output_filepath, args.output_exclude_default_phase) as outwriter: for name, annotated_curve in annotated_curves.items(): outwriter.write(name, annotated_curve) if args.figures: figure_filepath = filepath.with_suffix(args.output_suffix + ".pdf") - with figure_filepath.open("wb") as figfile: - with PDFWriter(figfile) as figwriter: - for name, annotated_curve in annotated_curves.items(): - figwriter.write(name, annotated_curve) + + with PDFWriter(figure_filepath) as figwriter: + for name, annotated_curve in annotated_curves.items(): + figwriter.write(name, annotated_curve) log.info("Done ..") diff --git a/test.py b/test.py index 1f5edc2..ee81438 100644 --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ +from pathlib import Path from unittest import TestCase import numpy @@ -60,83 +61,82 @@ def test_process_curve_time_unit(self): self.assertEqual(hours.growth_phases, minutes.growth_phases) def test_process_curve_basic(self): - with open("./test.basic.pdf", "wb") as f: - with PDFWriter(f) as doc: - mu = 0.5 - pph = 4.0 - curve = pandas.Series( - data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)], - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - - # self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) - - doc.write("#0 n0=1", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#0 n0=auto", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - curve = pandas.Series( - data=( - [1.0] * 5 - + [numpy.exp(mu * i / pph) for i in range(25)] - + [numpy.exp(mu * 24 / pph)] * 20 - ), - index=([i / pph for i in range(50)]), - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - doc.write("#1 n0=0", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#1 n0=auto", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 1) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - mu = 0.20 - curve = pandas.Series( - data=( - [i / 10.0 for i in range(10)] - + [numpy.exp(mu * i / pph) for i in range(25)] - + [numpy.exp(mu * 24 / pph)] * 15 - ), - index=([i / pph for i in range(50)]), - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - doc.write("#2 n0=0", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 6) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#2 n0=auto", result) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) + with PDFWriter(Path("test.basic.pdf")) as doc: + mu = 0.5 + pph = 4.0 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + + # self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) + + doc.write("#0 n0=1", result) + print(result.growth_phases) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) + self.assertTrue(result.growth_phases[0].SNR > 1000) + + result = process_curve(curve, constrain_n0=False) + doc.write("#0 n0=auto", result) + print(result.growth_phases) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) + self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) + self.assertTrue(result.growth_phases[0].SNR > 1000) + + curve = pandas.Series( + data=( + [1.0] * 5 + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 20 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#1 n0=0", result) + print(result.growth_phases) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) + self.assertTrue(result.growth_phases[0].SNR > 1000) + + result = process_curve(curve, constrain_n0=False) + doc.write("#1 n0=auto", result) + print(result.growth_phases) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 1) + self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) + self.assertTrue(result.growth_phases[0].SNR > 1000) + + mu = 0.20 + curve = pandas.Series( + data=( + [i / 10.0 for i in range(10)] + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 15 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#2 n0=0", result) + print(result.growth_phases) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) + self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 6) + self.assertTrue(result.growth_phases[0].SNR > 1000) + + result = process_curve(curve, constrain_n0=False) + doc.write("#2 n0=auto", result) + self.assertEqual(len(result.growth_phases), 1) + self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) + self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) + self.assertTrue(result.growth_phases[0].SNR > 1000) def test_process_curve_wrong_time_unit(self): mu = 0.5 From 5ef906aacdef14f2ba4174c6863036445b2fa05e Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 08/52] run tests using tox/pytest --- setup.py | 4 ---- tox.ini | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 tox.ini diff --git a/setup.py b/setup.py index f87ed62..762e29c 100644 --- a/setup.py +++ b/setup.py @@ -33,10 +33,6 @@ "croissance = croissance.main:entry_point", ], }, - test_suite="nose.collector", - tests_require=[ - "nose>=1.1.2", - ], install_requires=[ "coloredlogs>=14.0.0", "matplotlib>=1.4.3", diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..a86f9b8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,15 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py3 + +[testenv] +commands = + pytest test.py + +deps = + pytest + nose >= 1.1.2 From 6e907df0796c5bf36cc7b4536093960fcb2f87d0 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 09/52] update gitignore --- .gitignore | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 514a598..56ec7f4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,7 @@ *.so # Packages -*.egg -*.egg-info +*.egg* dist build eggs @@ -18,6 +17,7 @@ develop-eggs lib lib64 __pycache__ +venv # Installer logs pip-log.txt @@ -35,8 +35,9 @@ nosetests.xml .project .pydevproject -# PyCharm +# IDEs .idea +.vscode # Test output test.*.pdf \ No newline at end of file From d44402c754ddd8a655423d319cfeff9de7b89b96 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 10/52] split plotting into plotting and pdf writing --- croissance/figures/__init__.py | 132 --------------------------------- croissance/figures/plot.py | 115 ++++++++++++++++++++++++++++ croissance/figures/writer.py | 35 +++++++++ 3 files changed, 150 insertions(+), 132 deletions(-) create mode 100644 croissance/figures/plot.py create mode 100644 croissance/figures/writer.py diff --git a/croissance/figures/__init__.py b/croissance/figures/__init__.py index a64f0b8..e69de29 100644 --- a/croissance/figures/__init__.py +++ b/croissance/figures/__init__.py @@ -1,132 +0,0 @@ -import numpy -from matplotlib.backends.backend_pdf import PdfPages -import matplotlib.pyplot as plt - -from croissance.estimation import AnnotatedGrowthCurve - - -class PDFWriter: - def __init__(self, filepath, include_shifted_exponentials: bool = False): - self._handle = filepath.open("wb") - self._doc = PdfPages(self._handle) - self._include_shifted_exponentials = include_shifted_exponentials - - def write(self, name: str, curve: AnnotatedGrowthCurve): - - fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(16, 16)) - axes[1].set_yscale("log") - - try: - plt.title(name) - - if curve.series.max() <= 0: - return - - for axis in axes: - axis.plot( - curve.series.index, - curve.series.values, - color="black", - marker=".", - markersize=5, - linestyle="None", - ) - - axis.plot( - curve.outliers.index, - curve.outliers.values, - color="red", - marker=".", - markersize=5, - linestyle="None", - ) - - colors = ["b", "g", "c", "m", "y", "k"] - - for i, phase in enumerate(curve.growth_phases): - color = colors[i % len(colors)] - - # if phase['exclude'] is True: - # color = 'gray' - - a = 1 / numpy.exp(phase.intercept * phase.slope) - - def gf(x): - return a * numpy.exp(phase.slope * x) + phase.n0 - - phase_series = curve.series[phase.start : phase.end] - - for axis in axes: - if self._include_shifted_exponentials: - axis.plot( - phase_series.index, - phase_series.values - phase.n0, - marker=".", - linewidth=0, - color="red", - ) - - axis.plot( - phase_series.index, - gf(phase_series.index) - phase.n0, - marker=None, - linewidth=2, - color="red", - ) - - axis.axhline( - y=phase.n0, - marker=None, - linewidth=1, - linestyle="dashed", - color=color, - ) - axis.axvline( - x=phase.intercept, - marker=None, - linewidth=1, - linestyle="dashed", - color=color, - ) - - axis.plot( - phase_series.index, - phase_series.values, - marker=None, - linewidth=15, - color=color, - solid_capstyle="round", - alpha=0.2, - ) - - axis.plot( - curve.series.index, - gf(curve.series.index), - color=color, - linewidth=1, - ) - axis.plot( - phase_series.index, - gf(phase_series.index), - color=color, - linewidth=2, - ) - - for axis in axes: - axis.set_xlim(curve.series.index[0], curve.series.index[-1]) - - axes[0].set_ylim([max(curve.series.min(), -2.5), curve.series.max()]) - axes[1].set_ylim([0.1, curve.series.max()]) - finally: - self._doc.savefig(fig) - plt.close() - - def __enter__(self): - return self - - def __exit__(self, *args, **kwargs): - self.close() - - def close(self): - self._doc.close() - self._handle.close() diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py new file mode 100644 index 0000000..fafa370 --- /dev/null +++ b/croissance/figures/plot.py @@ -0,0 +1,115 @@ +import matplotlib.pyplot as plt +import numpy + +from croissance.estimation import AnnotatedGrowthCurve + + +def plot_processed_curve( + curve: AnnotatedGrowthCurve, + name: str, + figsize=(16, 16), + include_shifted_exponentials: bool = False, +): + fig, axes = plt.subplots(nrows=2, ncols=1, figsize=figsize) + + axes[1].set_yscale("log") + + fig.suptitle(name) + + if curve.series.max() <= 0: + return + + for axis in axes: + axis.plot( + curve.series.index, + curve.series.values, + color="black", + marker=".", + markersize=5, + linestyle="None", + ) + + axis.plot( + curve.outliers.index, + curve.outliers.values, + color="red", + marker=".", + markersize=5, + linestyle="None", + ) + + colors = ["b", "g", "c", "m", "y", "k"] + + for i, phase in enumerate(curve.growth_phases): + color = colors[i % len(colors)] + + a = 1 / numpy.exp(phase.intercept * phase.slope) + + def gf(x): + return a * numpy.exp(phase.slope * x) + phase.n0 + + phase_series = curve.series[phase.start : phase.end] + + for axis in axes: + if include_shifted_exponentials: + axis.plot( + phase_series.index, + phase_series.values - phase.n0, + marker=".", + linewidth=0, + color="red", + ) + + axis.plot( + phase_series.index, + gf(phase_series.index) - phase.n0, + marker=None, + linewidth=2, + color="red", + ) + + axis.axhline( + y=phase.n0, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + axis.axvline( + x=phase.intercept, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + + axis.plot( + phase_series.index, + phase_series.values, + marker=None, + linewidth=15, + color=color, + solid_capstyle="round", + alpha=0.2, + ) + + axis.plot( + curve.series.index, + gf(curve.series.index), + color=color, + linewidth=1, + ) + axis.plot( + phase_series.index, + gf(phase_series.index), + color=color, + linewidth=2, + ) + + for axis in axes: + axis.set_xlim(curve.series.index[0], curve.series.index[-1]) + + axes[0].set_ylim([max(curve.series.min(), -2.5), curve.series.max()]) + axes[1].set_ylim([0.1, curve.series.max()]) + + return fig, axes diff --git a/croissance/figures/writer.py b/croissance/figures/writer.py new file mode 100644 index 0000000..f3ad4f3 --- /dev/null +++ b/croissance/figures/writer.py @@ -0,0 +1,35 @@ +import matplotlib.pyplot as plt + +from matplotlib.backends.backend_pdf import PdfPages + +from croissance.estimation import AnnotatedGrowthCurve +from croissance.figures.plot import plot_processed_curve + + +class PDFWriter: + def __init__(self, filepath, include_shifted_exponentials: bool = False): + self._handle = filepath.open("wb") + self._doc = PdfPages(self._handle) + self._include_shifted_exponentials = include_shifted_exponentials + + def write(self, name: str, curve: AnnotatedGrowthCurve): + fig, _axes = plot_processed_curve( + curve=curve, + name=name, + include_shifted_exponentials=self._include_shifted_exponentials, + ) + + try: + self._doc.savefig(fig) + finally: + plt.close(fig) + + def __enter__(self): + return self + + def __exit__(self, *args, **kwargs): + self.close() + + def close(self): + self._doc.close() + self._handle.close() From 3d91a98d2b3b5a8ff7ca1ab33bceea4e638404f1 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 11/52] expose plotting function to library users This fixes #7 --- croissance/__init__.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index db9ad1a..92096db 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -1,12 +1,13 @@ -from collections import namedtuple +import croissance.figures.plot -from croissance.estimation import Estimator +from croissance.estimation import Estimator, AnnotatedGrowthCurve from croissance.estimation.util import normalize_time_unit -AnnotatedGrowthCurve = namedtuple( - "AnnotatedGrowthCurve", ("series", "outliers", "growth_phases") -) +__all__ = [ + "plot_processed_curve", + "process_curve", +] def process_curve( @@ -25,3 +26,17 @@ def process_curve( constrain_n0=constrain_n0, n0=n0, ).growth(curve) + + +def plot_processed_curve( + curve: AnnotatedGrowthCurve, + name: str = None, + figsize=(10, 10), + include_shifted_exponentials: bool = False, +): + return croissance.figures.plot.plot_processed_curve( + curve=curve, + name=name, + figsize=figsize, + include_shifted_exponentials=include_shifted_exponentials, + ) From 020af1c0392297f1c96e203b3794f9dbacee5a5a Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 12/52] allow non-Path filepaths --- croissance/figures/writer.py | 2 +- croissance/formats/input.py | 2 +- croissance/formats/output.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/croissance/figures/writer.py b/croissance/figures/writer.py index f3ad4f3..ef8b774 100644 --- a/croissance/figures/writer.py +++ b/croissance/figures/writer.py @@ -8,7 +8,7 @@ class PDFWriter: def __init__(self, filepath, include_shifted_exponentials: bool = False): - self._handle = filepath.open("wb") + self._handle = open(filepath, "wb") self._doc = PdfPages(self._handle) self._include_shifted_exponentials = include_shifted_exponentials diff --git a/croissance/formats/input.py b/croissance/formats/input.py index 25b5df8..1ce52a0 100644 --- a/croissance/formats/input.py +++ b/croissance/formats/input.py @@ -6,7 +6,7 @@ def __init__(self, filepath): self._filepath = filepath def read(self): - with self._filepath.open("rt") as handle: + with open(self._filepath, "rt") as handle: data = pandas.read_csv(handle, sep="\t", header=0, index_col=0) return [(name, data[name].dropna()) for name in data.columns] diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 1aa7ae5..653ee1d 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -6,7 +6,7 @@ class TSVWriter: def __init__(self, filepath, exclude_default_phase: bool = True): self._exclude_default_phase = exclude_default_phase - self._handle = filepath.open("wt") + self._handle = open(filepath, "wt") self._writer = csv.writer( self._handle, delimiter="\t", quoting=csv.QUOTE_MINIMAL ) From 2f1305e7b5d9396f6602b2cf5e5a54585652232e Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 13/52] fix import paths and remove unused imports --- croissance/main.py | 3 +-- test.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/croissance/main.py b/croissance/main.py index 9ae9609..2dc8bb7 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import argparse -import collections import logging import sys @@ -10,7 +9,7 @@ from croissance import Estimator from croissance.estimation.util import normalize_time_unit -from croissance.figures import PDFWriter +from croissance.figures.writer import PDFWriter from croissance.formats.input import TSVReader from croissance.formats.output import TSVWriter diff --git a/test.py b/test.py index ee81438..9f71d47 100644 --- a/test.py +++ b/test.py @@ -4,10 +4,10 @@ import numpy import pandas -from croissance import process_curve, AnnotatedGrowthCurve +from croissance import process_curve from croissance.estimation import fit_exponential from croissance.estimation.util import normalize_time_unit -from croissance.figures import PDFWriter +from croissance.figures.writer import PDFWriter class CroissanceTestCase(TestCase): From 01038792d0e4f13381cc99deef6028632cd1c6c2 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 14/52] remove dead code --- croissance/estimation/__init__.py | 8 +------- croissance/estimation/smoothing/__init__.py | 13 ------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index f6f174d..8642033 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -10,14 +10,8 @@ from croissance.estimation.outliers import remove_outliers from croissance.estimation.ranking import rank_phases from croissance.estimation.regression import fit_exponential -from croissance.estimation.smoothing import median_smoothing from croissance.estimation.smoothing.segments import segment_spline_smoothing -from croissance.estimation.util import ( - savitzky_golay, - resample, - with_overhangs, - points_per_hour, -) +from croissance.estimation.util import savitzky_golay, points_per_hour class RawGrowthPhase(namedtuple("RawGrowthPhase", ("start", "end"))): diff --git a/croissance/estimation/smoothing/__init__.py b/croissance/estimation/smoothing/__init__.py index 6a4c1e6..e69de29 100644 --- a/croissance/estimation/smoothing/__init__.py +++ b/croissance/estimation/smoothing/__init__.py @@ -1,13 +0,0 @@ -import pandas - -from croissance.estimation.util import with_overhangs - - -def median_smoothing(series, window=10): - data = pandas.rolling_apply( - with_overhangs(series.values, window // 2), - window, - median_window_clean, - center=True, - )[window // 2 : -window // 2] - return pandas.Series(index=series.index, data=data) From 341835b6765c38b15307808282768ee4bd96fbe9 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 15/52] simplify ranking calculation --- croissance/estimation/ranking.py | 34 ++++++++------------------------ 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/croissance/estimation/ranking.py b/croissance/estimation/ranking.py index 5b9def5..262b5d9 100644 --- a/croissance/estimation/ranking.py +++ b/croissance/estimation/ranking.py @@ -1,38 +1,20 @@ -from operator import itemgetter - -import numpy +from operator import attrgetter def score(values, q): - return (q - numpy.min(values)) / (numpy.max(values) - numpy.min(values)) * 100 + return (q - min(values)) / (max(values) - min(values)) * 100 def rank_phases(phases, weights, thresholds): values = {} - scores = [] - for attribute in weights: values[attribute] = [getattr(phase, attribute) for phase in phases] + values[attribute].append(thresholds[attribute]) for phase in phases: - scores.append( - ( - sum( - weight - * score( - [thresholds[attribute]] + values[attribute], - getattr(phase, attribute), - ) - for attribute, weight in weights.items() - ) - / sum(weights.values()), - phase, - ) - ) - - ranked_phases = [] - for rank, phase in sorted(scores, key=itemgetter(0), reverse=True): - phase.attributes["rank"] = rank - ranked_phases.append(phase) + phase.attributes["rank"] = sum( + weight * score(values[attribute], getattr(phase, attribute)) + for attribute, weight in weights.items() + ) / sum(weights.values()) - return ranked_phases + return sorted(phases, key=attrgetter("rank"), reverse=True) From fb872b227e31faa8d43bb6459dc2143d3e1784ea Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 16/52] add start/end positions to phase table --- croissance/formats/output.py | 42 +++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 653ee1d..0900be8 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -12,32 +12,34 @@ def __init__(self, filepath, exclude_default_phase: bool = True): ) self._writer.writerow( - ["name", "phase", "slope", "intercept", "N0", "SNR", "rank"] + ["name", "phase", "start", "end", "slope", "intercept", "N0", "SNR", "rank"] ) def write(self, name: str, curve: AnnotatedGrowthCurve): if not self._exclude_default_phase: phase = GrowthPhase.pick_best(curve.growth_phases, "rank") + if phase is None: + phase = GrowthPhase(None, None, None, None, None, {}) - if not phase: - self._writer.writerow([name, 0, None, None, None]) - else: - self._writer.writerow( - [ - name, - 0, - phase.slope, - phase.intercept, - phase.n0, - phase.SNR, - phase.rank, - ] - ) - - for i, phase in enumerate(curve.growth_phases, start=1): - self._writer.writerow( - [name, i, phase.slope, phase.intercept, phase.n0, phase.SNR, phase.rank] - ) + self._write_phase(name, 0, phase) + + for idx, phase in enumerate(curve.growth_phases, start=1): + self._write_phase(name, idx, phase) + + def _write_phase(self, name, idx, phase): + self._writer.writerow( + [ + name, + idx, + phase.start, + phase.end, + phase.slope, + phase.intercept, + phase.n0, + phase.SNR, + phase.rank, + ] + ) def __enter__(self): return self From 7a9d832dd9e4c1c53ed6945990c7e5380814ef37 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 17/52] simplify GrowthPhase class Values stored in the 'attributes' dict are made proper attributes in order to get rid of __getattr__ implemention that results in inconstent behavior when accessing missing attributes. --- croissance/estimation/__init__.py | 27 +++++++++++++++++---------- croissance/estimation/ranking.py | 7 +++++-- croissance/formats/output.py | 2 +- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 8642033..9e838fc 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -1,4 +1,5 @@ import logging + from collections import namedtuple from operator import attrgetter @@ -24,7 +25,7 @@ def duration(self): class GrowthPhase( namedtuple( - "GrowthPhase", ("start", "end", "slope", "intercept", "n0", "attributes") + "GrowthPhase", ("start", "end", "slope", "intercept", "n0", "SNR", "rank") ) ): __slots__ = () @@ -33,15 +34,13 @@ class GrowthPhase( def duration(self): return self.end - self.start - @classmethod - def pick_best(cls, growth_phases, metric="duration"): - try: - return sorted(growth_phases, key=attrgetter(metric), reverse=True)[0] - except IndexError: - return None + @staticmethod + def pick_best(growth_phases, metric="duration"): + growth_phases = sorted(growth_phases, key=attrgetter(metric)) + if growth_phases: + return growth_phases[-1] - def __getattr__(self, item): - return self.attributes.get(item, None) + return None AnnotatedGrowthCurve = namedtuple( @@ -161,7 +160,15 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: continue phases.append( - GrowthPhase(phase.start, phase.end, slope, intercept, n0, {"SNR": snr}) + GrowthPhase( + start=phase.start, + end=phase.end, + slope=slope, + intercept=intercept, + n0=n0, + SNR=snr, + rank=None, + ) ) ranked_phases = rank_phases( diff --git a/croissance/estimation/ranking.py b/croissance/estimation/ranking.py index 262b5d9..5e1c498 100644 --- a/croissance/estimation/ranking.py +++ b/croissance/estimation/ranking.py @@ -11,10 +11,13 @@ def rank_phases(phases, weights, thresholds): values[attribute] = [getattr(phase, attribute) for phase in phases] values[attribute].append(thresholds[attribute]) + ranked_phases = [] for phase in phases: - phase.attributes["rank"] = sum( + rank = sum( weight * score(values[attribute], getattr(phase, attribute)) for attribute, weight in weights.items() ) / sum(weights.values()) - return sorted(phases, key=attrgetter("rank"), reverse=True) + ranked_phases.append(phase._replace(rank=rank)) + + return sorted(ranked_phases, key=attrgetter("rank"), reverse=True) diff --git a/croissance/formats/output.py b/croissance/formats/output.py index 0900be8..5d3856b 100644 --- a/croissance/formats/output.py +++ b/croissance/formats/output.py @@ -19,7 +19,7 @@ def write(self, name: str, curve: AnnotatedGrowthCurve): if not self._exclude_default_phase: phase = GrowthPhase.pick_best(curve.growth_phases, "rank") if phase is None: - phase = GrowthPhase(None, None, None, None, None, {}) + phase = GrowthPhase(None, None, None, None, None, None, None) self._write_phase(name, 0, phase) From 58aceabf3a677dc7a8c3baa2b4b7edc6931fae84 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 18/52] simple parallelization of growth rate estimations --- croissance/estimation/__init__.py | 14 +++--- croissance/main.py | 77 +++++++++++++++++++++++-------- 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 9e838fc..14cda3c 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -91,7 +91,9 @@ def _find_growth_phases( return phases - def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: + def growth( + self, curve: pandas.Series, name: str = "untitled curve" + ) -> AnnotatedGrowthCurve: series = curve.dropna() n_hours = int( @@ -99,7 +101,7 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: ) if n_hours == 0: - self._log.warning("Less than 1 data-point/hour; cannot proceed") + self._log.warning("Fewer than one Data-points/hour for %s", name) return AnnotatedGrowthCurve(series, [], []) if n_hours % 2 == 0: @@ -109,20 +111,18 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve: # NOTE workaround for issue with negative curves if len(series[series > 0]) < 3: - self._log.warning("Less than 3 data-points post filtering; cannot proceeed") + self._log.warning("Fewer than three positive data-points for %s", name) return AnnotatedGrowthCurve(series, outliers, []) if self._segment_log_n0: series_log_n0 = numpy.log(series - self._n0).dropna() smooth_series = segment_spline_smoothing(series, series_log_n0) else: - smooth_series = segment_spline_smoothing( - series, - ) + smooth_series = segment_spline_smoothing(series) phases = [] if len(smooth_series) < n_hours: - self._log.warning("Insufficient data, cannot determine growth") + self._log.warning("Insufficient smoothed data for %s", name) return AnnotatedGrowthCurve(series, [], []) raw_phases = self._find_growth_phases(smooth_series, window=n_hours) diff --git a/croissance/main.py b/croissance/main.py index 2dc8bb7..2707029 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import argparse import logging +import multiprocessing +import signal import sys from pathlib import Path @@ -14,6 +16,30 @@ from croissance.formats.output import TSVWriter +class EstimatorWrapper: + def __init__(self, args): + self.estimator = Estimator( + constrain_n0=args.constrain_N0, + segment_log_n0=args.segment_log_N0, + n0=args.N0, + ) + + self.input_time_unit = args.input_time_unit + + def __call__(self, values): + filepath, idx, name, curve = values + + normalized_curve = normalize_time_unit(curve, self.input_time_unit) + annotated_curve = self.estimator.growth(normalized_curve, name) + + return (filepath, idx, name, annotated_curve) + + +def init_worker(): + # Ensure that KeyboardInterrupt exceptions only occur in the main thread + signal.signal(signal.SIGINT, signal.SIG_IGN) + + def parse_args(argv): parser = argparse.ArgumentParser( description="Estimate growth rates in growth curves", @@ -59,6 +85,13 @@ def parse_args(argv): help="Renders a PDF file with figures for each curve", ) + parser.add_argument( + "--threads", + type=int, + default=1, + help="Max number of threads to use during growth estimation", + ) + group = parser.add_argument_group("Logging") group.add_argument( "--log-level", @@ -84,39 +117,45 @@ def main(argv): args = parse_args(argv) log = setup_logging(level=args.log_level) - estimator = Estimator( - constrain_n0=args.constrain_N0, - segment_log_n0=args.segment_log_N0, - n0=args.N0, - ) - - log.info("Estimating growth curves ..") + curves = [] for filepath in args.infiles: log.info("Reading curves from '%s", filepath) with TSVReader(filepath) as reader: - curves = reader.read() + for idx, (name, curve) in enumerate(reader.read()): + if curve.empty: + log.warning("Skipping empty curve %r", name) + continue + + curves.append((filepath, idx, name, curve)) + log.info("Collected a total of %i growth curves", len(curves)) - annotated_curves = {} - for name, curve in curves: - if curve.empty: - log.warning("Skipping empty curve %r", name) - continue + # Dont spawn more processes than tasks + args.threads = max(1, min(args.threads, len(curves))) - log.info("Estimating growth for curve %r", name) - annotated_curves[name] = estimator.growth( - normalize_time_unit(curve, args.input_time_unit) - ) + log.info("Annotating growth curves ..") + annotated_curves = {filepath: [] for filepath in args.infiles} + with multiprocessing.Pool(processes=args.threads, initializer=init_worker) as pool: + async_calculation = pool.imap_unordered(EstimatorWrapper(args), curves) + for nth, (filepath, idx, name, curve) in enumerate(async_calculation, start=1): + log.info("Annotated curve %i of %i: %s", nth, len(curves), name) + + annotated_curves[filepath].append((idx, name, curve)) + + for filepath in args.infiles: output_filepath = filepath.with_suffix(args.output_suffix + ".tsv") + log.info("Writing annotated curves to '%s'", output_filepath) + with TSVWriter(output_filepath, args.output_exclude_default_phase) as outwriter: - for name, annotated_curve in annotated_curves.items(): + for _, name, annotated_curve in sorted(annotated_curves[filepath]): outwriter.write(name, annotated_curve) if args.figures: figure_filepath = filepath.with_suffix(args.output_suffix + ".pdf") + log.info("Writing PDFs to '%s'", figure_filepath) with PDFWriter(figure_filepath) as figwriter: - for name, annotated_curve in annotated_curves.items(): + for _, name, annotated_curve in sorted(annotated_curves[filepath]): figwriter.write(name, annotated_curve) log.info("Done ..") From 7b42feadb77c64821893038637eb9b2637d5ef2b Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 19/52] fix outliers not being returned in one case --- croissance/estimation/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 14cda3c..f6daee9 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -123,7 +123,7 @@ def growth( phases = [] if len(smooth_series) < n_hours: self._log.warning("Insufficient smoothed data for %s", name) - return AnnotatedGrowthCurve(series, [], []) + return AnnotatedGrowthCurve(series, outliers, []) raw_phases = self._find_growth_phases(smooth_series, window=n_hours) for phase in raw_phases: From a4830bdd9c9db39808861dfc111fc0fd3d65655e Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 20/52] simplify outlier masking --- croissance/estimation/outliers.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/croissance/estimation/outliers.py b/croissance/estimation/outliers.py index 0b99c82..0d4ed8e 100644 --- a/croissance/estimation/outliers.py +++ b/croissance/estimation/outliers.py @@ -5,28 +5,16 @@ def remove_outliers(series, window=30, std=2): """ - Removes any points where the distance of the median exceeds ``std`` standard deviations within a rolling window. - - :param series: - :param window: - :param std: - :return: + Returns a tuple containing a series with points removed where the distance of the + median exceeds ``std`` standard deviations within a rolling window, and a series + containing the removed outliers. """ if len(series.values) < 10: - return series, pandas.Series(data=[], index=[]) + return series, pandas.Series([], dtype="float64") values = with_overhangs(series.values, window) - outliers = ( - abs(values - values.rolling(window=window, center=True).median()) - < values.rolling(window=window, center=True).std() * std - ) + windows = values.rolling(window=window, center=True) + outliers = abs(values - windows.median()) >= windows.std() * std outlier_mask = outliers[window:-window].values - outliers = pandas.Series( - data=series.values[~outlier_mask], index=series.index[~outlier_mask] - ) - series = pandas.Series( - data=series.values[outlier_mask], index=series.index[outlier_mask] - ) - - return series, outliers + return series[~outlier_mask], series[outlier_mask] From 2a51af0333cf1228128834c56e5e84e173e0f05e Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 21/52] added hint for times that might be in minutes --- croissance/estimation/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index f6daee9..f839580 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -101,7 +101,11 @@ def growth( ) if n_hours == 0: - self._log.warning("Fewer than one Data-points/hour for %s", name) + self._log.warning( + "Fewer than one data-point per hour for %s. Use the command-line" + "`--input-time-unit minutes` if times are represented in minutes.", + name, + ) return AnnotatedGrowthCurve(series, [], []) if n_hours % 2 == 0: From bcae68feebc72131f8d7225128bab12b094509c6 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 22/52] outliers should always be a Series This fixes errors caused by code assuming this to be the case. --- croissance/estimation/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index f839580..6a8c650 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -106,7 +106,7 @@ def growth( "`--input-time-unit minutes` if times are represented in minutes.", name, ) - return AnnotatedGrowthCurve(series, [], []) + return AnnotatedGrowthCurve(series, pandas.Series(dtype="float64"), []) if n_hours % 2 == 0: n_hours += 1 From 4e9bb82020b3432f60a7a58a3416ef7b5f1d3fda Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 23/52] remove unused functionality --- croissance/__init__.py | 12 ++---------- croissance/figures/plot.py | 24 +----------------------- croissance/figures/writer.py | 9 ++------- 3 files changed, 5 insertions(+), 40 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index 92096db..10b7fef 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -28,15 +28,7 @@ def process_curve( ).growth(curve) -def plot_processed_curve( - curve: AnnotatedGrowthCurve, - name: str = None, - figsize=(10, 10), - include_shifted_exponentials: bool = False, -): +def plot_processed_curve(curve: AnnotatedGrowthCurve, name=None, figsize=(10, 10)): return croissance.figures.plot.plot_processed_curve( - curve=curve, - name=name, - figsize=figsize, - include_shifted_exponentials=include_shifted_exponentials, + curve=curve, name=name, figsize=figsize ) diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py index fafa370..5630940 100644 --- a/croissance/figures/plot.py +++ b/croissance/figures/plot.py @@ -4,12 +4,7 @@ from croissance.estimation import AnnotatedGrowthCurve -def plot_processed_curve( - curve: AnnotatedGrowthCurve, - name: str, - figsize=(16, 16), - include_shifted_exponentials: bool = False, -): +def plot_processed_curve(curve: AnnotatedGrowthCurve, name: str, figsize=(16, 16)): fig, axes = plt.subplots(nrows=2, ncols=1, figsize=figsize) axes[1].set_yscale("log") @@ -51,23 +46,6 @@ def gf(x): phase_series = curve.series[phase.start : phase.end] for axis in axes: - if include_shifted_exponentials: - axis.plot( - phase_series.index, - phase_series.values - phase.n0, - marker=".", - linewidth=0, - color="red", - ) - - axis.plot( - phase_series.index, - gf(phase_series.index) - phase.n0, - marker=None, - linewidth=2, - color="red", - ) - axis.axhline( y=phase.n0, marker=None, diff --git a/croissance/figures/writer.py b/croissance/figures/writer.py index ef8b774..afb019c 100644 --- a/croissance/figures/writer.py +++ b/croissance/figures/writer.py @@ -7,17 +7,12 @@ class PDFWriter: - def __init__(self, filepath, include_shifted_exponentials: bool = False): + def __init__(self, filepath): self._handle = open(filepath, "wb") self._doc = PdfPages(self._handle) - self._include_shifted_exponentials = include_shifted_exponentials def write(self, name: str, curve: AnnotatedGrowthCurve): - fig, _axes = plot_processed_curve( - curve=curve, - name=name, - include_shifted_exponentials=self._include_shifted_exponentials, - ) + fig, _axes = plot_processed_curve(curve=curve, name=name) try: self._doc.savefig(fig) From 83aa990d3b9832ac9e4beccfae34b7e54355d79c Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 24/52] add missing whitespace to log message --- croissance/estimation/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 6a8c650..464f74a 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -102,7 +102,7 @@ def growth( if n_hours == 0: self._log.warning( - "Fewer than one data-point per hour for %s. Use the command-line" + "Fewer than one data-point per hour for %s. Use the command-line " "`--input-time-unit minutes` if times are represented in minutes.", name, ) From 2de4c23dbd0e8f48ebafaadd3cd0de95baf2453e Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 25/52] cleanup and handle edge cases in `points_per_hour` --- croissance/estimation/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/croissance/estimation/util.py b/croissance/estimation/util.py index 63adae7..7b6fcd4 100644 --- a/croissance/estimation/util.py +++ b/croissance/estimation/util.py @@ -5,9 +5,10 @@ def points_per_hour(series): - return 1 / numpy.median( - [series.index[i] - series.index[i - 1] for i in range(1, len(series))] - ) + if len(series) < 2: + return len(series) + + return 1 / numpy.median(series.index[1:] - series.index[:-1]) def resample(series, *, factor=10, size=None): From 41b376698c7833d6186aa1f586742703bec6e440 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 26/52] fix division-by-0 in `score` for identical values This was observed for curve with a single phase with duration = 1.5, but it could theoretically occur for any attribute and any number of phases. --- croissance/estimation/ranking.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/croissance/estimation/ranking.py b/croissance/estimation/ranking.py index 5e1c498..e1d7fd2 100644 --- a/croissance/estimation/ranking.py +++ b/croissance/estimation/ranking.py @@ -2,7 +2,11 @@ def score(values, q): - return (q - min(values)) / (max(values) - min(values)) * 100 + span = max(values) - min(values) + if not span: + return 100 + + return (q - min(values)) / span * 100 def rank_phases(phases, weights, thresholds): From 6a9a741d28a89b1a62219b004e63d8c6a4aee607 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 27/52] catch and log errors in worker threads --- croissance/main.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/croissance/main.py b/croissance/main.py index 2707029..2148494 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -4,6 +4,7 @@ import multiprocessing import signal import sys +import traceback from pathlib import Path @@ -29,10 +30,16 @@ def __init__(self, args): def __call__(self, values): filepath, idx, name, curve = values - normalized_curve = normalize_time_unit(curve, self.input_time_unit) - annotated_curve = self.estimator.growth(normalized_curve, name) + try: + normalized_curve = normalize_time_unit(curve, self.input_time_unit) + annotated_curve = self.estimator.growth(normalized_curve, name) - return (filepath, idx, name, annotated_curve) + return (filepath, idx, name, annotated_curve) + except Exception: + log = logging.getLogger("croissance") + log.error("Unhandled exception while annotating %r:", name) + for line in traceback.format_exc().splitlines(): + log.error("%s", line) def init_worker(): @@ -131,13 +138,19 @@ def main(argv): # Dont spawn more processes than tasks args.threads = max(1, min(args.threads, len(curves))) + log.info("Annotating growth curves using %i threads", args.threads) - log.info("Annotating growth curves ..") + return_code = 0 annotated_curves = {filepath: [] for filepath in args.infiles} with multiprocessing.Pool(processes=args.threads, initializer=init_worker) as pool: async_calculation = pool.imap_unordered(EstimatorWrapper(args), curves) - for nth, (filepath, idx, name, curve) in enumerate(async_calculation, start=1): + for nth, result in enumerate(async_calculation, start=1): + if result is None: + return_code = 1 + continue + + (filepath, idx, name, curve) = result log.info("Annotated curve %i of %i: %s", nth, len(curves), name) annotated_curves[filepath].append((idx, name, curve)) @@ -160,6 +173,8 @@ def main(argv): log.info("Done ..") + return return_code + def entry_point(): sys.exit(main(sys.argv[1:])) From 663886a351ddc36760488f2cf948d3bba61c0f81 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 28/52] add option for controlling y-axis scale The option allows for output plots with linear, logorithmic, or both types of scales for y-axies. --- croissance/figures/plot.py | 145 ++++++++++++++++++----------------- croissance/figures/writer.py | 8 +- croissance/main.py | 9 ++- 3 files changed, 90 insertions(+), 72 deletions(-) diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py index 5630940..f2e45c2 100644 --- a/croissance/figures/plot.py +++ b/croissance/figures/plot.py @@ -4,34 +4,44 @@ from croissance.estimation import AnnotatedGrowthCurve -def plot_processed_curve(curve: AnnotatedGrowthCurve, name: str, figsize=(16, 16)): - fig, axes = plt.subplots(nrows=2, ncols=1, figsize=figsize) - - axes[1].set_yscale("log") - +def plot_processed_curve(curve: AnnotatedGrowthCurve, name: str, yscale="log"): + fig, axes = plt.subplots(nrows=2 if yscale == "both" else 1, ncols=1) fig.suptitle(name) if curve.series.max() <= 0: return - for axis in axes: - axis.plot( - curve.series.index, - curve.series.values, - color="black", - marker=".", - markersize=5, - linestyle="None", - ) - - axis.plot( - curve.outliers.index, - curve.outliers.values, - color="red", - marker=".", - markersize=5, - linestyle="None", - ) + if yscale == "both": + _do_plot_processed_curve(curve, axes[0], "linear") + _do_plot_processed_curve(curve, axes[1], "log") + + return fig, axes + elif yscale in ("log", "linear"): + _do_plot_processed_curve(curve, axes, yscale) + + return fig, [axes] + else: + raise ValueError(yscale) + + +def _do_plot_processed_curve(curve, axis, yscale): + axis.plot( + curve.series.index, + curve.series.values, + color="black", + marker=".", + markersize=5, + linestyle="None", + ) + + axis.plot( + curve.outliers.index, + curve.outliers.values, + color="red", + marker=".", + markersize=5, + linestyle="None", + ) colors = ["b", "g", "c", "m", "y", "k"] @@ -45,49 +55,46 @@ def gf(x): phase_series = curve.series[phase.start : phase.end] - for axis in axes: - axis.axhline( - y=phase.n0, - marker=None, - linewidth=1, - linestyle="dashed", - color=color, - ) - axis.axvline( - x=phase.intercept, - marker=None, - linewidth=1, - linestyle="dashed", - color=color, - ) - - axis.plot( - phase_series.index, - phase_series.values, - marker=None, - linewidth=15, - color=color, - solid_capstyle="round", - alpha=0.2, - ) - - axis.plot( - curve.series.index, - gf(curve.series.index), - color=color, - linewidth=1, - ) - axis.plot( - phase_series.index, - gf(phase_series.index), - color=color, - linewidth=2, - ) - - for axis in axes: - axis.set_xlim(curve.series.index[0], curve.series.index[-1]) - - axes[0].set_ylim([max(curve.series.min(), -2.5), curve.series.max()]) - axes[1].set_ylim([0.1, curve.series.max()]) - - return fig, axes + axis.axhline( + y=phase.n0, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + axis.axvline( + x=phase.intercept, + marker=None, + linewidth=1, + linestyle="dashed", + color=color, + ) + + axis.axhline(y=phase.n0, linewidth=1, linestyle="dashed", color=color) + axis.axvline(x=phase.intercept, linewidth=1, linestyle="dashed", color=color) + + axis.plot( + phase_series.index, + phase_series.values, + marker=None, + linewidth=15, + color=color, + solid_capstyle="round", + alpha=0.2, + ) + + axis.plot(curve.series.index, gf(curve.series.index), color=color, linewidth=1) + axis.plot(phase_series.index, gf(phase_series.index), color=color, linewidth=2) + + axis.set_xlim(curve.series.index[0], curve.series.index[-1]) + + yvalues = curve.series + ymargin = (yvalues.max() - yvalues.min()) * 0.025 + yvalues_min = 0.01 if yscale == "log" else float("-inf") + axis.set_ylim([max(yvalues.min() - ymargin, yvalues_min), yvalues.max() + ymargin]) + + xvalues = curve.series.index + xmargin = (xvalues.max() - xvalues.min()) * 0.025 + axis.set_xlim([xvalues.min() - xmargin, xvalues.max() + xmargin]) + + axis.set_yscale(yscale) diff --git a/croissance/figures/writer.py b/croissance/figures/writer.py index afb019c..09b7863 100644 --- a/croissance/figures/writer.py +++ b/croissance/figures/writer.py @@ -7,12 +7,16 @@ class PDFWriter: - def __init__(self, filepath): + def __init__(self, filepath, yscale="both"): self._handle = open(filepath, "wb") self._doc = PdfPages(self._handle) + self._yscale = yscale def write(self, name: str, curve: AnnotatedGrowthCurve): - fig, _axes = plot_processed_curve(curve=curve, name=name) + fig, _axes = plot_processed_curve(curve=curve, name=name, yscale=self._yscale) + + fig.set_figwidth(16) + fig.set_figheight(16 if self._yscale == "both" else 8) try: self._doc.savefig(fig) diff --git a/croissance/main.py b/croissance/main.py index 2148494..fa6c1c8 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -91,6 +91,13 @@ def parse_args(argv): action="store_true", help="Renders a PDF file with figures for each curve", ) + parser.add_argument( + "--figures-yscale", + type=str.lower, + default="both", + choices=("log", "linear", "both"), + help="Yscale(s) for figures. If both, then two plots are generated per curve", + ) parser.add_argument( "--threads", @@ -167,7 +174,7 @@ def main(argv): figure_filepath = filepath.with_suffix(args.output_suffix + ".pdf") log.info("Writing PDFs to '%s'", figure_filepath) - with PDFWriter(figure_filepath) as figwriter: + with PDFWriter(figure_filepath, yscale=args.figures_yscale) as figwriter: for _, name, annotated_curve in sorted(annotated_curves[filepath]): figwriter.write(name, annotated_curve) From 794c2472e82a30dd81751f7af4bb1ffea2a9fd7b Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 29/52] remove extranous set_xlim --- croissance/figures/plot.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py index f2e45c2..dfb2e68 100644 --- a/croissance/figures/plot.py +++ b/croissance/figures/plot.py @@ -86,8 +86,6 @@ def gf(x): axis.plot(curve.series.index, gf(curve.series.index), color=color, linewidth=1) axis.plot(phase_series.index, gf(phase_series.index), color=color, linewidth=2) - axis.set_xlim(curve.series.index[0], curve.series.index[-1]) - yvalues = curve.series ymargin = (yvalues.max() - yvalues.min()) * 0.025 yvalues_min = 0.01 if yscale == "log" else float("-inf") From 08e6e7a9947f003ee3ec64a59a12bbaa1118980b Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 30/52] lints --- croissance/estimation/__init__.py | 5 ----- croissance/estimation/regression.py | 6 +++--- croissance/estimation/smoothing/segments.py | 22 ++++++++------------- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 464f74a..bb0b2bb 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -141,11 +141,6 @@ def growth( if phase.duration < defaults.PHASE_MINIMUM_DURATION_HOURS: continue - # snr_estimate, slope_estimate = signal_noise_ratio_estimate(phase_series.values) - # # skip any growth phase with an estimated S/N ratio < 1 or negative slope. - # if slope_estimate <= 0: - # continue - if self._constrain_n0: slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( phase_series, n0=self._n0 diff --git a/croissance/estimation/regression.py b/croissance/estimation/regression.py index 833b15f..3c02793 100644 --- a/croissance/estimation/regression.py +++ b/croissance/estimation/regression.py @@ -9,9 +9,9 @@ def exponential(x, a, b, c): def fit_exponential(series, *, p0=(1.0, 0.01, 0.0), n0: float = None): """ - Fits an exponential to a series. First attempts an exponential fit in linear space using p0, then falls back to a - fit in log space to attempt to find parameters p0 for a linear fit; if all else fails returns the linear fit. - + Fits an exponential to a series. First attempts an exponential fit in linear space + using p0, then falls back to a fit in log space to attempt to find parameters p0 + for a linear fit; if all else fails returns the linear fit. """ if n0 is None: diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index da31560..6b27487 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -8,13 +8,9 @@ def segment_by_std_dev(series, increment=2, maximum=20): """ - Divides a series into segments, minimizing standard deviation over window size. Windows are of varying size from - `increment` to `maximum * increment` at each offset `increment` within the series. - - :param series: - :param increment: - :param maximum: - :return: + Divides a series into segments, minimizing standard deviation over window size. + Windows are of varying size from `increment` to `maximum * increment` at each + offset `increment` within the series. """ start = int(series.index.min()) duration = int(series.index[-2]) @@ -58,15 +54,13 @@ def window_median(window, start, end): def segment_points(series, segments): """ - Picks knot points for an interpolating spline along a series of segments according to these rules: + Picks knot points for an interpolating spline along a series of segments according + to these rules: - For small segments, add a knot in the center of the segment - - For medium-sized segments, add a knot each near the beginning and end of the segment - - For large segments, add a knot a knot each near the beginning and end of the segment, and one in the center. - - :param series: - :param segments: - :return: + - For medium-sized segments, add a knot near the beginning and end of the segment + - For large segments, add a knot a knot near the beginning and end of the segment, + and one in the center. """ out = [(series.index[0], numpy.median(series[: series.index[0] + 1]))] From 875b0e48cfcf73d2adebe07bb886ecd7ad247d30 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 31/52] small simplification --- croissance/estimation/__init__.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index bb0b2bb..d6bd8da 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -58,8 +58,7 @@ def __init__( ): self._log = logging.getLogger(__name__) self._segment_log_n0 = segment_log_n0 - self._constrain_n0 = constrain_n0 - self._n0 = n0 + self._n0 = n0 if constrain_n0 else None def _find_growth_phases( self, curve: "pandas.Series", window @@ -141,14 +140,9 @@ def growth( if phase.duration < defaults.PHASE_MINIMUM_DURATION_HOURS: continue - if self._constrain_n0: - slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( - phase_series, n0=self._n0 - ) - else: - slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( - phase_series - ) + slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( + phase_series, n0=self._n0 + ) # skip phases whose actual slope is below the limit if slope < max(0.0, defaults.PHASE_MINIMUM_SLOPE): From 1a063e080d66bd6a55dabd830a59a6c89b98de7a Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 32/52] remove dead code --- croissance/estimation/util.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/croissance/estimation/util.py b/croissance/estimation/util.py index 7b6fcd4..9767cf3 100644 --- a/croissance/estimation/util.py +++ b/croissance/estimation/util.py @@ -1,6 +1,5 @@ import numpy import pandas -from scipy.interpolate import InterpolatedUnivariateSpline from scipy.signal import savgol_filter @@ -11,26 +10,6 @@ def points_per_hour(series): return 1 / numpy.median(series.index[1:] - series.index[:-1]) -def resample(series, *, factor=10, size=None): - """ - Returns a new series re-sampled to a given number of points. - - :param series: - :param factor: a number of points per unit time to scale the series to. - :param size: a number of points to scale the series to. - :return: - """ - series = series.dropna() - start, end = series.index[0], series.index[-1] - - if size is None: - size = (end - start) * factor - - index = numpy.linspace(start, end, size) - spline = InterpolatedUnivariateSpline(series.index, series.values) - return pandas.Series(index=index, data=spline(index)) - - def savitzky_golay(series, *args, **kwargs): return pandas.Series( index=series.index, data=savgol_filter(series.values, *args, **kwargs) From 7b28d91056759123660890d7ab987376835b0d77 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 14:22:48 +0100 Subject: [PATCH 33/52] improved plot limits using auto-calculated ranges --- croissance/figures/plot.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py index dfb2e68..cc35d05 100644 --- a/croissance/figures/plot.py +++ b/croissance/figures/plot.py @@ -43,6 +43,11 @@ def _do_plot_processed_curve(curve, axis, yscale): linestyle="None", ) + # Lock x/y limits to those auto-calculated prior to the addition of growth curves + axis.set_yscale(yscale) + axis.set_xlim(axis.get_xlim()) + axis.set_ylim(axis.get_ylim()) + colors = ["b", "g", "c", "m", "y", "k"] for i, phase in enumerate(curve.growth_phases): @@ -85,14 +90,3 @@ def gf(x): axis.plot(curve.series.index, gf(curve.series.index), color=color, linewidth=1) axis.plot(phase_series.index, gf(phase_series.index), color=color, linewidth=2) - - yvalues = curve.series - ymargin = (yvalues.max() - yvalues.min()) * 0.025 - yvalues_min = 0.01 if yscale == "log" else float("-inf") - axis.set_ylim([max(yvalues.min() - ymargin, yvalues_min), yvalues.max() + ymargin]) - - xvalues = curve.series.index - xmargin = (xvalues.max() - xvalues.min()) * 0.025 - axis.set_xlim([xvalues.min() - xmargin, xvalues.max() + xmargin]) - - axis.set_yscale(yscale) From 27f9567d0ff899cd45557884109fd7bfb5f75b81 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 15:49:25 +0100 Subject: [PATCH 34/52] handle gappy data in `segment_by_std_dev` This prevents divisions by zero when calling `detrend`. --- croissance/estimation/smoothing/segments.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index 6b27487..4dd9751 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -18,10 +18,13 @@ def segment_by_std_dev(series, increment=2, maximum=20): for i in range(start, duration, increment): for size in range(1, maximum + 1): - window = detrend(series[i : i + size * increment]) - heappush( - windows, (window.std() / (size * increment), i, i + size * increment) - ) + window = series[i : i + size * increment] + if not window.empty: + window = detrend(window) + heappush( + windows, + (window.std() / (size * increment), i, i + size * increment), + ) segments = [] spots = set() From 61f006269a4b3fbb7663035cf72e8f4af610e312 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 15:50:59 +0100 Subject: [PATCH 35/52] simplify `segment_by_std_dev` --- croissance/estimation/smoothing/segments.py | 32 +++++++-------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index 4dd9751..5d12e07 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -1,7 +1,6 @@ -from heapq import heappush, heappop - import numpy import pandas + from scipy.signal import detrend from scipy.interpolate import InterpolatedUnivariateSpline @@ -14,37 +13,28 @@ def segment_by_std_dev(series, increment=2, maximum=20): """ start = int(series.index.min()) duration = int(series.index[-2]) - windows = [] + windows = [] for i in range(start, duration, increment): for size in range(1, maximum + 1): window = series[i : i + size * increment] + # Gaps in measurements may result in empty windows if not window.empty: window = detrend(window) - heappush( - windows, - (window.std() / (size * increment), i, i + size * increment), + windows.append( + (window.std() / (size * increment), i, i + size * increment) ) segments = [] spots = set() + for _window_agv_std, start, end in sorted(windows): + window_spots = range(start, int(end)) - try: - while True: - _window_agv_std, start, end = heappop(windows) - - if any(i in spots for i in range(start, int(end))): - continue - - for i in range(start, int(end)): - spots.add(int(i)) - - heappush(segments, (start, min(duration, end))) - - except IndexError: - pass + if not any(i in spots for i in window_spots): + segments.append((start, min(duration, end))) + spots.update(window_spots) - return [heappop(segments) for _ in range(len(segments))] + return sorted(segments) def window_median(window, start, end): From ce4505627bf5d32be2fa5e82b6b64f3edbd20dad Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 15:53:15 +0100 Subject: [PATCH 36/52] improve handling of gappy data in `segment_points` This prevents subsequent interpolation from returning a series of NaNs. --- croissance/estimation/smoothing/segments.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index 5d12e07..f57fc23 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -57,23 +57,21 @@ def segment_points(series, segments): """ out = [(series.index[0], numpy.median(series[: series.index[0] + 1]))] - for start, end in segments: + def _add_knot(start, end): window = series[start:end] + if not window.empty: + out.append(window_median(window, start, end)) - if window.empty: - continue - + for start, end in segments: if end - start > 5: - out.append(window_median(series[start : start + 2], start, start + 2)) + _add_knot(start, start + 2) if end - start > 11: - out.append( - window_median(series[start + 2 : end - 2], start + 2, end - 2) - ) + _add_knot(start + 2, end - 2) - out.append(window_median(series[end - 2 : end], end - 2, end)) + _add_knot(end - 2, end) else: - out.append(window_median(window, start, end)) + _add_knot(start, end) out += [(series.index[-1], numpy.max(series[series.index[0] - 1 :]))] From 2bc9fbc35d721a6dcba098977416d904e029e5fc Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Tue, 24 Nov 2020 17:13:39 +0100 Subject: [PATCH 37/52] update Travis config --- .travis.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0a1cd2f..213d14d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,9 +4,10 @@ cache: python: - "3.5" before_install: - - | + - | mkdir -p $HOME/.config/matplotlib && \ echo 'backend: agg' > $HOME/.config/matplotlib/matplotlibrc install: - - "pip install ." -script: python setup.py test + - python setup.py install + - pip install pytest +script: pytest From 95b4a4cb0a41cbd0bbe759e73d8f45ed8cc6af1b Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:38:14 +0100 Subject: [PATCH 38/52] collect coverage when running tests --- tox.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index a86f9b8..f427e36 100644 --- a/tox.ini +++ b/tox.ini @@ -8,8 +8,13 @@ envlist = py3 [testenv] commands = - pytest test.py + pytest test.py \ + --cov croissance \ + --no-cov-on-fail \ + --cov-branch \ + --cov-report=term-missing:skip-covered deps = pytest + pytest-cov nose >= 1.1.2 From 3b3482d957e816a0d8916146632a1fc7ca226988 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:38:43 +0100 Subject: [PATCH 39/52] simplify plotting API --- croissance/__init__.py | 6 ++---- croissance/figures/plot.py | 4 +--- croissance/figures/writer.py | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index 10b7fef..2ffbe9d 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -28,7 +28,5 @@ def process_curve( ).growth(curve) -def plot_processed_curve(curve: AnnotatedGrowthCurve, name=None, figsize=(10, 10)): - return croissance.figures.plot.plot_processed_curve( - curve=curve, name=name, figsize=figsize - ) +def plot_processed_curve(curve: AnnotatedGrowthCurve, yscale="both"): + return croissance.figures.plot.plot_processed_curve(curve=curve, yscale=yscale) diff --git a/croissance/figures/plot.py b/croissance/figures/plot.py index cc35d05..f3e5256 100644 --- a/croissance/figures/plot.py +++ b/croissance/figures/plot.py @@ -4,10 +4,8 @@ from croissance.estimation import AnnotatedGrowthCurve -def plot_processed_curve(curve: AnnotatedGrowthCurve, name: str, yscale="log"): +def plot_processed_curve(curve: AnnotatedGrowthCurve, yscale="log"): fig, axes = plt.subplots(nrows=2 if yscale == "both" else 1, ncols=1) - fig.suptitle(name) - if curve.series.max() <= 0: return diff --git a/croissance/figures/writer.py b/croissance/figures/writer.py index 09b7863..47a0fd5 100644 --- a/croissance/figures/writer.py +++ b/croissance/figures/writer.py @@ -13,8 +13,9 @@ def __init__(self, filepath, yscale="both"): self._yscale = yscale def write(self, name: str, curve: AnnotatedGrowthCurve): - fig, _axes = plot_processed_curve(curve=curve, name=name, yscale=self._yscale) + fig, _axes = plot_processed_curve(curve=curve, yscale=self._yscale) + fig.suptitle(name) fig.set_figwidth(16) fig.set_figheight(16 if self._yscale == "both" else 8) From 7b5031674c8eeb9ac22fa263461df68121dd34c9 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:39:45 +0100 Subject: [PATCH 40/52] reorganize unit tests --- test.py | 159 ---------------------------------- tests/__init__.py | 0 tests/api_test.py | 157 +++++++++++++++++++++++++++++++++ tests/estimation/__init__.py | 0 tests/estimation/main_test.py | 19 ++++ tests/estimation/util_test.py | 11 +++ tox.ini | 2 +- 7 files changed, 188 insertions(+), 160 deletions(-) delete mode 100644 test.py create mode 100644 tests/__init__.py create mode 100644 tests/api_test.py create mode 100644 tests/estimation/__init__.py create mode 100644 tests/estimation/main_test.py create mode 100644 tests/estimation/util_test.py diff --git a/test.py b/test.py deleted file mode 100644 index 9f71d47..0000000 --- a/test.py +++ /dev/null @@ -1,159 +0,0 @@ -from pathlib import Path -from unittest import TestCase - -import numpy -import pandas - -from croissance import process_curve -from croissance.estimation import fit_exponential -from croissance.estimation.util import normalize_time_unit -from croissance.figures.writer import PDFWriter - - -class CroissanceTestCase(TestCase): - def test_regression_basic(self): - for mu in (0.001, 0.10, 0.15, 0.50, 1.0): - phase = pandas.Series( - data=[numpy.exp(i * mu) for i in range(20)], - index=[i for i in range(20)], - ) - - slope, intercept, n0, snr, lin = fit_exponential(phase) - print(slope, intercept, n0, snr, lin) - - self.assertFalse(lin, "This fit should not require a linear fallback.") - self.assertAlmostEqual(n0, 0, 2, msg="N0=0") - self.assertAlmostEqual(intercept, 0, 2, msg='"intercept"=0') - self.assertAlmostEqual(mu, slope, 6, msg="growth rate (mu)={}".format(mu)) - self.assertTrue(snr > 100000, "signal-noise ratio should be very good") - - def test_process_curve_empty_series(self): - result = process_curve(pandas.Series([], dtype="float64")) - - self.assertTrue(result.series.empty) - self.assertEqual(result.outliers, []) - self.assertEqual(result.growth_phases, []) - - def test_process_curve_null_series(self): - curve = pandas.Series(index=[1, 2], data=[None, float("NaN")]) - result = process_curve(curve) - - self.assertIs(result.series, curve) - self.assertEqual(result.outliers, []) - self.assertEqual(result.growth_phases, []) - - def test_process_curve_time_unit(self): - mu = 0.5 - pph = 4.0 - curve = pandas.Series( - data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)], - ) - - hours = process_curve(curve, unit="hours") - minutes = process_curve( - pandas.Series(index=curve.index * 60.0, data=curve.values), - unit="minutes", - ) - - self.assertTrue(hours.series.equals(minutes.series)) - self.assertTrue(hours.outliers.equals(minutes.outliers)) - self.assertEqual(hours.growth_phases, minutes.growth_phases) - - def test_process_curve_basic(self): - with PDFWriter(Path("test.basic.pdf")) as doc: - mu = 0.5 - pph = 4.0 - curve = pandas.Series( - data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)], - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - - # self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) - - doc.write("#0 n0=1", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#0 n0=auto", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - curve = pandas.Series( - data=( - [1.0] * 5 - + [numpy.exp(mu * i / pph) for i in range(25)] - + [numpy.exp(mu * 24 / pph)] * 20 - ), - index=([i / pph for i in range(50)]), - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - doc.write("#1 n0=0", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 3) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#1 n0=auto", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 1) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - mu = 0.20 - curve = pandas.Series( - data=( - [i / 10.0 for i in range(10)] - + [numpy.exp(mu * i / pph) for i in range(25)] - + [numpy.exp(mu * 24 / pph)] * 15 - ), - index=([i / pph for i in range(50)]), - ) - - result = process_curve(curve, constrain_n0=True, n0=0.0) - doc.write("#2 n0=0", result) - print(result.growth_phases) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertAlmostEqual(0.0, result.growth_phases[0].n0, 6) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - result = process_curve(curve, constrain_n0=False) - doc.write("#2 n0=auto", result) - self.assertEqual(len(result.growth_phases), 1) - self.assertAlmostEqual(mu, result.growth_phases[0].slope, 2) - self.assertTrue(-0.25 < result.growth_phases[0].n0 < 0.25) - self.assertTrue(result.growth_phases[0].SNR > 1000) - - def test_process_curve_wrong_time_unit(self): - mu = 0.5 - pph = 0.1 - curve = pandas.Series( - data=[numpy.exp(mu * i / pph) for i in range(100)], - index=[i / pph for i in range(100)], - ) - - result = process_curve(curve, constrain_n0=False, n0=0.0) - self.assertEqual(len(result.growth_phases), 0) - self.assertEqual(list(result.series), list(curve)) - - def test_normalize_time_unit(self): - curve = pandas.Series(index=[0, 15, 30, 60], data=[1, 2, 3, 4]) - self.assertTrue( - pandas.Series(index=[0.0, 0.25, 0.5, 1.0], data=[1, 2, 3, 4]).equals( - normalize_time_unit(curve, "minutes") - ) - ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/api_test.py b/tests/api_test.py new file mode 100644 index 0000000..01ccb0c --- /dev/null +++ b/tests/api_test.py @@ -0,0 +1,157 @@ +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy +import pandas +import pytest + + +from pytest import approx + +from croissance import process_curve, plot_processed_curve +from croissance.figures.writer import PDFWriter + + +def test_process_curve_empty_series(): + result = process_curve(pandas.Series([], dtype="float64")) + + assert result.series.empty + assert result.outliers == [] + assert result.growth_phases == [] + + +def test_process_curve_null_series(): + curve = pandas.Series(index=[1, 2], data=[None, float("NaN")]) + result = process_curve(curve) + + assert result.series is curve + assert result.outliers == [] + assert result.growth_phases == [] + + +def test_process_curve_time_unit(): + mu = 0.5 + pph = 4.0 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + hours = process_curve(curve, unit="hours") + minutes = process_curve( + pandas.Series(index=curve.index * 60.0, data=curve.values), + unit="minutes", + ) + + assert hours.series.equals(minutes.series) + assert hours.outliers.equals(minutes.outliers) + assert hours.growth_phases == minutes.growth_phases + + +def test_process_curve_basic(): + with PDFWriter(Path("test.basic.pdf")) as doc: + mu = 0.5 + pph = 4.0 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + + # self.assertAlmostEqual(mu, slope, 6, msg='growth rate (mu)={}'.format(mu)) + + doc.write("#0 n0=1", result) + print(result.growth_phases) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-2) + assert 0.0 == approx(result.growth_phases[0].n0, 1e-3) + assert result.growth_phases[0].SNR > 1000 + + result = process_curve(curve, constrain_n0=False) + doc.write("#0 n0=auto", result) + print(result.growth_phases) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-2) + assert -0.25 < result.growth_phases[0].n0 < 0.25 + assert result.growth_phases[0].SNR > 1000 + + curve = pandas.Series( + data=( + [1.0] * 5 + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 20 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#1 n0=0", result) + print(result.growth_phases) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-2) + assert 0.0 == approx(result.growth_phases[0].n0, 1e-3) + assert result.growth_phases[0].SNR > 1000 + + result = process_curve(curve, constrain_n0=False) + doc.write("#1 n0=auto", result) + print(result.growth_phases) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-1) + assert -0.25 < result.growth_phases[0].n0 < 0.25 + assert result.growth_phases[0].SNR > 1000 + + mu = 0.20 + curve = pandas.Series( + data=( + [i / 10.0 for i in range(10)] + + [numpy.exp(mu * i / pph) for i in range(25)] + + [numpy.exp(mu * 24 / pph)] * 15 + ), + index=([i / pph for i in range(50)]), + ) + + result = process_curve(curve, constrain_n0=True, n0=0.0) + doc.write("#2 n0=0", result) + print(result.growth_phases) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-2) + assert 0.0 == approx(result.growth_phases[0].n0, 1e-6) + assert result.growth_phases[0].SNR > 1000 + + result = process_curve(curve, constrain_n0=False) + doc.write("#2 n0=auto", result) + assert len(result.growth_phases) == 1 + assert mu == approx(result.growth_phases[0].slope, abs=1e-2) + assert -0.25 < result.growth_phases[0].n0 < 0.25 + assert result.growth_phases[0].SNR > 1000 + + +def test_process_curve_wrong_time_unit(): + mu = 0.5 + pph = 0.1 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + result = process_curve(curve, constrain_n0=False, n0=0.0) + assert len(result.growth_phases) == 0 + assert list(result.series) == list(curve) + + +@pytest.mark.parametrize("yscale, naxis", (("linear", 1), ("log", 1), ("both", 2))) +def test_plot_processed_curve(yscale, naxis): + mu = 0.5 + pph = 4.0 + curve = pandas.Series( + data=[numpy.exp(mu * i / pph) for i in range(100)], + index=[i / pph for i in range(100)], + ) + + curve = process_curve(curve, constrain_n0=True, n0=0.0) + fig, axes = plot_processed_curve(curve, yscale=yscale) + try: + assert len(axes) == naxis + finally: + plt.close() diff --git a/tests/estimation/__init__.py b/tests/estimation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/estimation/main_test.py b/tests/estimation/main_test.py new file mode 100644 index 0000000..9aae500 --- /dev/null +++ b/tests/estimation/main_test.py @@ -0,0 +1,19 @@ +import numpy +import pandas +import pytest + +from croissance.estimation import fit_exponential + + +@pytest.mark.parametrize("mu", (0.001, 0.10, 0.15, 0.50, 1.0)) +def test_regression_basic(mu): + phase = numpy.exp(pandas.Series(range(20)) * mu) + slope, intercept, n0, snr, lin = fit_exponential(phase) + + print(slope, intercept, n0, snr, lin) + + assert not lin, "This fit should not require a linear fallback." + assert n0 == pytest.approx(0, abs=1e-3), "N0=0" + assert intercept == pytest.approx(0, abs=1e-3), '"intercept"=0' + assert mu == pytest.approx(slope, abs=1e-7), "growth rate (mu)={}".format(mu) + assert snr > 100000, "signal-noise ratio should be very good" diff --git a/tests/estimation/util_test.py b/tests/estimation/util_test.py new file mode 100644 index 0000000..c9c9e12 --- /dev/null +++ b/tests/estimation/util_test.py @@ -0,0 +1,11 @@ +import pandas + +from croissance.estimation.util import normalize_time_unit + + +def test_normalize_time_unit(): + curve = pandas.Series(index=[0, 15, 30, 60], data=[1, 2, 3, 4]) + + assert pandas.Series(index=[0.0, 0.25, 0.5, 1.0], data=[1, 2, 3, 4]).equals( + normalize_time_unit(curve, "minutes") + ) diff --git a/tox.ini b/tox.ini index f427e36..06fe06c 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ envlist = py3 [testenv] commands = - pytest test.py \ + pytest tests \ --cov croissance \ --no-cov-on-fail \ --cov-branch \ From 5867065a46c84f6b2aa84f744ef071e639d45fcb Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:40:39 +0100 Subject: [PATCH 41/52] require Python 3.6 to meet scipy requirements --- .travis.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 213d14d..a63517c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: python cache: - pip: true python: - - "3.5" + - "3.6" before_install: - | mkdir -p $HOME/.config/matplotlib && \ diff --git a/setup.py b/setup.py index 762e29c..6f4371e 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "Topic :: Utilities", "Intended Audience :: Science/Research", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "License :: OSI Approved :: Apache Software License", ], ) From 685b872ecf4091ee3c311ee943cb2db5560417ba Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:40:59 +0100 Subject: [PATCH 42/52] prevent unhandled exception for very short curves This includes curves for which only a very small number of measurements are available and curves with hour time-units for which the `--input-time minutes` option was was mistakenly used. --- croissance/estimation/smoothing/segments.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py index f57fc23..b02ddd4 100644 --- a/croissance/estimation/smoothing/segments.py +++ b/croissance/estimation/smoothing/segments.py @@ -79,10 +79,15 @@ def _add_knot(start, end): return pandas.Series(index=index, data=data) -def segment_spline_smoothing(series, series_std_dev=None): +def segment_spline_smoothing(series, series_std_dev=None, k=3): if series_std_dev is None: series_std_dev = series + segments = segment_by_std_dev(series_std_dev) points = segment_points(series, segments).sort_index() - spline = InterpolatedUnivariateSpline(points.index, points.values, k=3) + if len(points) < k + 1: + # InterpolatedUnivariateSpline requires at k + 1 points + return pandas.Series(dtype="float64") + + spline = InterpolatedUnivariateSpline(points.index, points.values, k=k) return pandas.Series(data=spline(series.index), index=series.index) From c14ab48ec0eb319613a9db3b170d48a77919f6ec Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 25 Nov 2020 16:42:20 +0100 Subject: [PATCH 43/52] convert stateless Estimator class to function --- croissance/__init__.py | 9 +- croissance/estimation/__init__.py | 226 ++++++++++++++---------------- croissance/main.py | 18 ++- 3 files changed, 126 insertions(+), 127 deletions(-) diff --git a/croissance/__init__.py b/croissance/__init__.py index 2ffbe9d..bfe4b92 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -1,6 +1,8 @@ +import pandas + import croissance.figures.plot -from croissance.estimation import Estimator, AnnotatedGrowthCurve +from croissance.estimation import AnnotatedGrowthCurve, estimate_growth from croissance.estimation.util import normalize_time_unit @@ -21,11 +23,12 @@ def process_curve( if curve.isnull().all(): return AnnotatedGrowthCurve(curve, [], []) - return Estimator( + return estimate_growth( + curve, segment_log_n0=segment_log_n0, constrain_n0=constrain_n0, n0=n0, - ).growth(curve) + ) def plot_processed_curve(curve: AnnotatedGrowthCurve, yscale="both"): diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index d6bd8da..69bc515 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -48,138 +48,130 @@ def pick_best(growth_phases, metric="duration"): ) -class Estimator: - def __init__( - self, - *, - segment_log_n0: bool = False, - constrain_n0: bool = False, - n0: float = 0.0 - ): - self._log = logging.getLogger(__name__) - self._segment_log_n0 = segment_log_n0 - self._n0 = n0 if constrain_n0 else None - - def _find_growth_phases( - self, curve: "pandas.Series", window - ) -> "typing.List[RawGrowthPhase]": - """ - Finds growth phases by locating regions in a series where both the first and - second derivatives are positive. - """ - first_derivative = savitzky_golay(curve, window, 3, deriv=1) - second_derivative = savitzky_golay(curve, window, 3, deriv=2) - - growth = pandas.Series( - index=curve.index, - data=(first_derivative.values > 0) & (second_derivative.values > 0), +def estimate_growth( + curve: pandas.Series, + *, + segment_log_n0: bool = False, + constrain_n0: bool = False, + n0: float = 0.0, + name: str = "untitled curve" +) -> AnnotatedGrowthCurve: + log = logging.getLogger(__name__) + series = curve.dropna() + + n_hours = int( + numpy.round(points_per_hour(series) * defaults.CURVE_MINIMUM_DURATION_HOURS) + ) + + if n_hours == 0: + log.warning( + "Fewer than one data-point per hour for %s. Use the command-line " + "`--input-time-unit minutes` if times are represented in minutes.", + name, ) + return AnnotatedGrowthCurve(series, pandas.Series(dtype="float64"), []) - istart, iend, phases = None, None, [] - for i, v in zip(growth.index, growth.values): - if v: - if istart is None: - istart = i - elif istart is not None: - phases.append(RawGrowthPhase(istart, iend)) - istart = None - iend = i - - if istart is not None: - phases.append(RawGrowthPhase(istart, iend)) + if n_hours % 2 == 0: + n_hours += 1 - return phases + series, outliers = remove_outliers(series, window=n_hours, std=3) - def growth( - self, curve: pandas.Series, name: str = "untitled curve" - ) -> AnnotatedGrowthCurve: - series = curve.dropna() + # NOTE workaround for issue with negative curves + if len(series[series > 0]) < 3: + log.warning("Fewer than three positive data-points for %s", name) + return AnnotatedGrowthCurve(series, outliers, []) - n_hours = int( - numpy.round(points_per_hour(series) * defaults.CURVE_MINIMUM_DURATION_HOURS) - ) + if segment_log_n0: + series_log_n0 = numpy.log(series - n0).dropna() + smooth_series = segment_spline_smoothing(series, series_log_n0) + else: + smooth_series = segment_spline_smoothing(series) - if n_hours == 0: - self._log.warning( - "Fewer than one data-point per hour for %s. Use the command-line " - "`--input-time-unit minutes` if times are represented in minutes.", - name, - ) - return AnnotatedGrowthCurve(series, pandas.Series(dtype="float64"), []) + if len(smooth_series) < n_hours: + log.warning("Insufficient smoothed data for %s", name) + return AnnotatedGrowthCurve(series, outliers, []) - if n_hours % 2 == 0: - n_hours += 1 + phases = [] + for phase in _find_growth_phases(smooth_series, window=n_hours): + phase_series = series[phase.start : phase.end] - series, outliers = remove_outliers(series, window=n_hours, std=3) + # skip any growth phases that have not enough points for fitting + if len(phase_series[phase_series > 0]) < 3: + continue - # NOTE workaround for issue with negative curves - if len(series[series > 0]) < 3: - self._log.warning("Fewer than three positive data-points for %s", name) - return AnnotatedGrowthCurve(series, outliers, []) + # skip any phases with less than minimum duration + if phase.duration < defaults.PHASE_MINIMUM_DURATION_HOURS: + continue - if self._segment_log_n0: - series_log_n0 = numpy.log(series - self._n0).dropna() - smooth_series = segment_spline_smoothing(series, series_log_n0) - else: - smooth_series = segment_spline_smoothing(series) + slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( + phase_series, n0=n0 if constrain_n0 else None + ) - phases = [] - if len(smooth_series) < n_hours: - self._log.warning("Insufficient smoothed data for %s", name) - return AnnotatedGrowthCurve(series, outliers, []) - raw_phases = self._find_growth_phases(smooth_series, window=n_hours) + # skip phases whose actual slope is below the limit + if slope < max(0.0, defaults.PHASE_MINIMUM_SLOPE): + continue + + # skip phases whose actual signal-noise-ratio is below the limit + if snr < max(1.0, defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO): + continue + + phases.append( + GrowthPhase( + start=phase.start, + end=phase.end, + slope=slope, + intercept=intercept, + n0=n0, + SNR=snr, + rank=None, + ) + ) - for phase in raw_phases: - phase_series = series[phase.start : phase.end] + ranked_phases = rank_phases( + phases, + defaults.PHASE_RANK_WEIGHTS, + thresholds={ + "SNR": defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO, + "duration": defaults.PHASE_MINIMUM_DURATION_HOURS, + "slope": defaults.PHASE_MINIMUM_SLOPE, + }, + ) - # skip any growth phases that have not enough points for fitting - if len(phase_series[phase_series > 0]) < 3: - continue + return AnnotatedGrowthCurve( + series, + outliers, + [ + phase + for phase in ranked_phases + if phase.rank >= defaults.PHASE_RANK_EXCLUDE_BELOW + ], + ) - # skip any phases with less than minimum duration - if phase.duration < defaults.PHASE_MINIMUM_DURATION_HOURS: - continue - slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( - phase_series, n0=self._n0 - ) +def _find_growth_phases(curve: "pandas.Series", window): + """ + Finds growth phases by locating regions in a series where both the first and + second derivatives are positive. + """ + first_derivative = savitzky_golay(curve, window, 3, deriv=1) + second_derivative = savitzky_golay(curve, window, 3, deriv=2) - # skip phases whose actual slope is below the limit - if slope < max(0.0, defaults.PHASE_MINIMUM_SLOPE): - continue - - # skip phases whose actual signal-noise-ratio is below the limit - if snr < max(1.0, defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO): - continue - - phases.append( - GrowthPhase( - start=phase.start, - end=phase.end, - slope=slope, - intercept=intercept, - n0=n0, - SNR=snr, - rank=None, - ) - ) + growth = pandas.Series( + index=curve.index, + data=(first_derivative.values > 0) & (second_derivative.values > 0), + ) - ranked_phases = rank_phases( - phases, - defaults.PHASE_RANK_WEIGHTS, - thresholds={ - "SNR": defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO, - "duration": defaults.PHASE_MINIMUM_DURATION_HOURS, - "slope": defaults.PHASE_MINIMUM_SLOPE, - }, - ) + istart, iend, phases = None, None, [] + for i, v in zip(growth.index, growth.values): + if v: + if istart is None: + istart = i + elif istart is not None: + phases.append(RawGrowthPhase(istart, iend)) + istart = None + iend = i - return AnnotatedGrowthCurve( - series, - outliers, - [ - phase - for phase in ranked_phases - if phase.rank >= defaults.PHASE_RANK_EXCLUDE_BELOW - ], - ) + if istart is not None: + phases.append(RawGrowthPhase(istart, iend)) + + return phases diff --git a/croissance/main.py b/croissance/main.py index fa6c1c8..4856c21 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -10,7 +10,7 @@ import coloredlogs -from croissance import Estimator +from croissance import estimate_growth from croissance.estimation.util import normalize_time_unit from croissance.figures.writer import PDFWriter from croissance.formats.input import TSVReader @@ -19,11 +19,9 @@ class EstimatorWrapper: def __init__(self, args): - self.estimator = Estimator( - constrain_n0=args.constrain_N0, - segment_log_n0=args.segment_log_N0, - n0=args.N0, - ) + self.constrain_n0 = args.constrain_N0 + self.segment_log_n0 = args.segment_log_N0 + self.n0 = args.N0 self.input_time_unit = args.input_time_unit @@ -32,7 +30,13 @@ def __call__(self, values): try: normalized_curve = normalize_time_unit(curve, self.input_time_unit) - annotated_curve = self.estimator.growth(normalized_curve, name) + annotated_curve = estimate_growth( + normalized_curve, + segment_log_n0=self.segment_log_n0, + constrain_n0=self.constrain_n0, + n0=self.n0, + name=name, + ) return (filepath, idx, name, annotated_curve) except Exception: From a569332e72ef0d445b90e8d203053e312abb2a57 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 9 Dec 2020 10:44:19 +0100 Subject: [PATCH 44/52] removed unused constants --- croissance/estimation/defaults.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/croissance/estimation/defaults.py b/croissance/estimation/defaults.py index f074476..b0cb31c 100644 --- a/croissance/estimation/defaults.py +++ b/croissance/estimation/defaults.py @@ -11,16 +11,3 @@ "slope": 10, # TODO add 1 - start? } - - -# NOTE these defaults are good for fast growing bacteria such as E.coli: - -MINIMUM_VALID_OD = 0.05 -MINIMUM_VALID_SLOPE = 0.05 -MINIMUM_DATA_HOURS = 5 -RESAMPLE_POINTS_PER_HOUR = 6 -PHASE_MIN_DELTA_LOG_OD = 0.20 -PHASE_MIN_DELTA_OD = 0.25 -MINIMUM_PHASE_LENGTH = 2 - -MINOR_PHASE_MERGING_THRESHOLD = 0.10 # in percent From cfa281b833c09ae0fd4359edfbec79cac00293a0 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 9 Dec 2020 11:44:14 +0100 Subject: [PATCH 45/52] cleanup command-line arguments --- croissance/main.py | 69 +++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/croissance/main.py b/croissance/main.py index 4856c21..d9c5436 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -58,44 +58,44 @@ def parse_args(argv): ) parser.add_argument("infiles", type=Path, nargs="+") - parser.add_argument("--input-format", type=str.lower, choices=("tsv",)) + parser.add_argument( + "--threads", + type=int, + default=1, + help="Max number of threads to use during growth estimation", + ) + + group = parser.add_argument_group("Input") + group.add_argument("--input-format", type=str.lower, choices=("tsv",)) + group.add_argument( "--input-time-unit", default="hours", choices=("hours", "minutes"), help="Time unit in time column", ) - parser.add_argument( - "--N0", - type=float, - default=0.0, - help="Baseline for constrained regressions and identifying curve segments in " - "log space", - ) - parser.add_argument( - "--constrain-N0", - action="store_true", - help="All growth phases will begin at N0 when this flag is set", - ) - parser.add_argument( - "--segment-log-N0", - action="store_true", - help="Identify curve segments using log(N-N0) rather than N; increases " - "sensitivity to changes in exponential growth but has to assume a certain N0", + + group = parser.add_argument_group("Output") + group.add_argument("--output-format", type=str.lower, choices=("tsv",)) + group.add_argument( + "--output-suffix", + type=str, + default=".output", + help="Suffix used for output files in addition to the file extension. By " + "default, an input file `file.tsv` will result in output files named `file. " + "output.tsv` and `file.output.pdf`", ) - parser.add_argument("--output-format", type=str.lower, choices=("tsv",)) - parser.add_argument("--output-suffix", type=str, default=".output") - parser.add_argument( + group.add_argument( "--output-exclude-default-phase", action="store_true", help="Do not output phase '0' for each curve", ) - parser.add_argument( + group.add_argument( "--figures", action="store_true", help="Renders a PDF file with figures for each curve", ) - parser.add_argument( + group.add_argument( "--figures-yscale", type=str.lower, default="both", @@ -103,11 +103,24 @@ def parse_args(argv): help="Yscale(s) for figures. If both, then two plots are generated per curve", ) - parser.add_argument( - "--threads", - type=int, - default=1, - help="Max number of threads to use during growth estimation", + group = parser.add_argument_group("Growth model") + group.add_argument( + "--N0", + type=float, + default=0.0, + help="Baseline for constrained regressions and identifying curve segments in " + "log space", + ) + group.add_argument( + "--constrain-N0", + action="store_true", + help="All growth phases will begin at N0 when this flag is set", + ) + group.add_argument( + "--segment-log-N0", + action="store_true", + help="Identify curve segments using log(N-N0) rather than N; increases " + "sensitivity to changes in exponential growth but has to assume a certain N0", ) group = parser.add_argument_group("Logging") From aab72c5eb9bf827cc0fc8a6e72246c96e5467729 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 9 Dec 2020 12:42:51 +0100 Subject: [PATCH 46/52] expose phase quality filters on command-line --- croissance/__init__.py | 18 +++++---- croissance/estimation/__init__.py | 64 +++++++++++++++++++++++-------- croissance/estimation/defaults.py | 13 ------- croissance/main.py | 44 +++++++++++++++++---- 4 files changed, 94 insertions(+), 45 deletions(-) delete mode 100644 croissance/estimation/defaults.py diff --git a/croissance/__init__.py b/croissance/__init__.py index bfe4b92..25a4e97 100644 --- a/croissance/__init__.py +++ b/croissance/__init__.py @@ -2,7 +2,11 @@ import croissance.figures.plot -from croissance.estimation import AnnotatedGrowthCurve, estimate_growth +from croissance.estimation import ( + AnnotatedGrowthCurve, + GrowthEstimationParameters, + estimate_growth, +) from croissance.estimation.util import normalize_time_unit @@ -23,12 +27,12 @@ def process_curve( if curve.isnull().all(): return AnnotatedGrowthCurve(curve, [], []) - return estimate_growth( - curve, - segment_log_n0=segment_log_n0, - constrain_n0=constrain_n0, - n0=n0, - ) + params = GrowthEstimationParameters() + params.segment_log_n0 = segment_log_n0 + params.constrain_n0 = constrain_n0 + params.n0 = n0 + + return estimate_growth(curve, params=params) def plot_processed_curve(curve: AnnotatedGrowthCurve, yscale="both"): diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py index 69bc515..467e4f9 100644 --- a/croissance/estimation/__init__.py +++ b/croissance/estimation/__init__.py @@ -6,8 +6,6 @@ import numpy import pandas -import croissance.estimation.defaults as defaults - from croissance.estimation.outliers import remove_outliers from croissance.estimation.ranking import rank_phases from croissance.estimation.regression import fit_exponential @@ -48,19 +46,51 @@ def pick_best(growth_phases, metric="duration"): ) +class GrowthEstimationParameters: + __slots__ = [ + "segment_log_n0", + "constrain_n0", + "n0", + "curve_minimum_duration_hours", + "phase_minimum_signal_noise_ratio", + "phase_minimum_duration_hours", + "phase_minimum_slope", + "phase_rank_exclude_below", + "phase_rank_weights", + ] + + def __init__(self): + self.segment_log_n0 = False + self.constrain_n0 = False + self.n0 = 0.0 + + self.curve_minimum_duration_hours = 5 + self.phase_minimum_signal_noise_ratio = 1.0 + self.phase_minimum_duration_hours = 1.5 + self.phase_minimum_slope = 0.005 + + self.phase_rank_exclude_below = 33 + self.phase_rank_weights = { + "SNR": 50, + "duration": 30, + "slope": 10, + # TODO add 1 - start? + } + + def estimate_growth( curve: pandas.Series, *, - segment_log_n0: bool = False, - constrain_n0: bool = False, - n0: float = 0.0, + params=GrowthEstimationParameters(), name: str = "untitled curve" ) -> AnnotatedGrowthCurve: log = logging.getLogger(__name__) series = curve.dropna() n_hours = int( - numpy.round(points_per_hour(series) * defaults.CURVE_MINIMUM_DURATION_HOURS) + numpy.round( + points_per_hour(series) * max(1, params.curve_minimum_duration_hours) + ) ) if n_hours == 0: @@ -81,8 +111,8 @@ def estimate_growth( log.warning("Fewer than three positive data-points for %s", name) return AnnotatedGrowthCurve(series, outliers, []) - if segment_log_n0: - series_log_n0 = numpy.log(series - n0).dropna() + if params.segment_log_n0: + series_log_n0 = numpy.log(series - params.n0).dropna() smooth_series = segment_spline_smoothing(series, series_log_n0) else: smooth_series = segment_spline_smoothing(series) @@ -100,19 +130,19 @@ def estimate_growth( continue # skip any phases with less than minimum duration - if phase.duration < defaults.PHASE_MINIMUM_DURATION_HOURS: + if phase.duration < max(0.0, params.phase_minimum_duration_hours): continue slope, intercept, n0, snr, _fallback_linear_method = fit_exponential( - phase_series, n0=n0 if constrain_n0 else None + phase_series, n0=params.n0 if params.constrain_n0 else None ) # skip phases whose actual slope is below the limit - if slope < max(0.0, defaults.PHASE_MINIMUM_SLOPE): + if slope < max(0.0, params.phase_minimum_slope): continue # skip phases whose actual signal-noise-ratio is below the limit - if snr < max(1.0, defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO): + if snr < max(1.0, params.phase_minimum_signal_noise_ratio): continue phases.append( @@ -129,11 +159,11 @@ def estimate_growth( ranked_phases = rank_phases( phases, - defaults.PHASE_RANK_WEIGHTS, + params.phase_rank_weights, thresholds={ - "SNR": defaults.PHASE_MINIMUM_SIGNAL_NOISE_RATIO, - "duration": defaults.PHASE_MINIMUM_DURATION_HOURS, - "slope": defaults.PHASE_MINIMUM_SLOPE, + "duration": max(0.0, params.phase_minimum_duration_hours), + "slope": max(0.0, params.phase_minimum_slope), + "SNR": max(1.0, params.phase_minimum_signal_noise_ratio), }, ) @@ -143,7 +173,7 @@ def estimate_growth( [ phase for phase in ranked_phases - if phase.rank >= defaults.PHASE_RANK_EXCLUDE_BELOW + if phase.rank >= params.phase_rank_exclude_below ], ) diff --git a/croissance/estimation/defaults.py b/croissance/estimation/defaults.py deleted file mode 100644 index b0cb31c..0000000 --- a/croissance/estimation/defaults.py +++ /dev/null @@ -1,13 +0,0 @@ -CURVE_MINIMUM_DURATION_HOURS = 5 - -PHASE_MINIMUM_SIGNAL_NOISE_RATIO = 1.0 -PHASE_MINIMUM_DURATION_HOURS = 1.5 -PHASE_MINIMUM_SLOPE = 0.005 - -PHASE_RANK_EXCLUDE_BELOW = 33 -PHASE_RANK_WEIGHTS = { - "SNR": 50, - "duration": 30, - "slope": 10, - # TODO add 1 - start? -} diff --git a/croissance/main.py b/croissance/main.py index d9c5436..e546c4d 100644 --- a/croissance/main.py +++ b/croissance/main.py @@ -10,7 +10,7 @@ import coloredlogs -from croissance import estimate_growth +from croissance import GrowthEstimationParameters, estimate_growth from croissance.estimation.util import normalize_time_unit from croissance.figures.writer import PDFWriter from croissance.formats.input import TSVReader @@ -19,9 +19,17 @@ class EstimatorWrapper: def __init__(self, args): - self.constrain_n0 = args.constrain_N0 - self.segment_log_n0 = args.segment_log_N0 - self.n0 = args.N0 + self.params = GrowthEstimationParameters() + + self.params.constrain_n0 = args.constrain_N0 + self.params.segment_log_n0 = args.segment_log_N0 + self.params.n0 = args.N0 + + self.params.phase_minimum_signal_noise_ratio = ( + args.phase_minimum_signal_to_noise + ) + self.params.phase_minimum_duration_hours = args.phase_minimum_duration + self.params.phase_minimum_slope = args.phase_minimum_slope self.input_time_unit = args.input_time_unit @@ -32,9 +40,7 @@ def __call__(self, values): normalized_curve = normalize_time_unit(curve, self.input_time_unit) annotated_curve = estimate_growth( normalized_curve, - segment_log_n0=self.segment_log_n0, - constrain_n0=self.constrain_n0, - n0=self.n0, + params=self.params, name=name, ) @@ -103,11 +109,12 @@ def parse_args(argv): help="Yscale(s) for figures. If both, then two plots are generated per curve", ) + defaults = GrowthEstimationParameters() group = parser.add_argument_group("Growth model") group.add_argument( "--N0", type=float, - default=0.0, + default=defaults.n0, help="Baseline for constrained regressions and identifying curve segments in " "log space", ) @@ -122,6 +129,27 @@ def parse_args(argv): help="Identify curve segments using log(N-N0) rather than N; increases " "sensitivity to changes in exponential growth but has to assume a certain N0", ) + group.add_argument( + "--phase-minimum-signal-to-noise", + type=float, + metavar="SNR", + default=defaults.phase_minimum_signal_noise_ratio, + help="Minimum phase signal-to-noise ratio", + ) + group.add_argument( + "--phase-minimum-duration", + type=float, + metavar="HOURS", + default=defaults.phase_minimum_duration_hours, + help="Minimum duration of phases in hours", + ) + group.add_argument( + "--phase-minimum-slope", + type=float, + metavar="SLOPE", + default=defaults.phase_minimum_slope, + help="Minimum phase slope", + ) group = parser.add_argument_group("Logging") group.add_argument( From 1f5bad0bfeec7b0175cea959ea61d99d391f0ec0 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 9 Dec 2020 12:42:59 +0100 Subject: [PATCH 47/52] tweak travis config --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a63517c..dd3e3e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,4 +10,4 @@ before_install: install: - python setup.py install - pip install pytest -script: pytest +script: pytest tests From 8faeb7822cdf5689604392cc5b0274c3684f03bf Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Wed, 9 Dec 2020 12:45:12 +0100 Subject: [PATCH 48/52] simplify setup.py --- setup.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 6f4371e..165f32a 100644 --- a/setup.py +++ b/setup.py @@ -13,20 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import unicode_literals -import codecs - -from setuptools import setup, find_packages +from setuptools import setup setup( name="croissance", version="1.1.1", - packages=find_packages(exclude=["*tests*"]), + packages=["croissance"], url="https://github.com/biosustain/croissance", author="Lars Schöning", author_email="lays@biosustain.dtu.dk", description="A tool for estimating growth rates in growth curves.", - long_description=codecs.open("README.rst", encoding="utf-8").read(), + long_description=open("README.rst", encoding="utf-8").read(), license="Apache License Version 2.0", entry_points={ "console_scripts": [ From afecf701053ccffb59fbe8c48c9dcc6ab9af4d99 Mon Sep 17 00:00:00 2001 From: Mikkel Schubert Date: Thu, 10 Dec 2020 14:35:11 +0100 Subject: [PATCH 49/52] alternative install method --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index dd3e3e0..626c061 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,6 @@ before_install: mkdir -p $HOME/.config/matplotlib && \ echo 'backend: agg' > $HOME/.config/matplotlib/matplotlibrc install: - - python setup.py install + - pip install . - pip install pytest script: pytest tests From 1142c904cf80515117f97c518dae0472eebe424f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emre=20=C3=96zdemir?= Date: Tue, 19 Jan 2021 10:20:13 +0100 Subject: [PATCH 50/52] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 165f32a..89e874b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name="croissance", - version="1.1.1", + version="1.2.0", packages=["croissance"], url="https://github.com/biosustain/croissance", author="Lars Schöning", From 6554cb6d4908a7a24a320e120d44850d6226c3b9 Mon Sep 17 00:00:00 2001 From: meono Date: Tue, 19 Jan 2021 11:55:45 +0100 Subject: [PATCH 51/52] previous form seems to exclude submodules from install. --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 89e874b..fc6a604 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from setuptools import setup +from setuptools import setup, find_packages setup( name="croissance", version="1.2.0", - packages=["croissance"], + packages=find_packages(where="croissance", exclude=["tests"]), + package_dir={"": "croissance"}, url="https://github.com/biosustain/croissance", author="Lars Schöning", author_email="lays@biosustain.dtu.dk", From 35c50f3e93e0d027b2113756473210e54db5948c Mon Sep 17 00:00:00 2001 From: meono Date: Thu, 21 Jan 2021 09:17:24 +0100 Subject: [PATCH 52/52] path got messed up for submodules, so reverting to simple version wihtout exluding test. --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index fc6a604..8faec7e 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,7 @@ setup( name="croissance", version="1.2.0", - packages=find_packages(where="croissance", exclude=["tests"]), - package_dir={"": "croissance"}, + packages=find_packages(), url="https://github.com/biosustain/croissance", author="Lars Schöning", author_email="lays@biosustain.dtu.dk",