Merge pull request #4 from meono/master

Some fixes related to edge cases and pandas features required
biosustain · Jul 14, 2020 · e9a9251 · e9a9251
2 parents da57133 + 102d8ae
commit e9a9251
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 8 deletions.
diff --git a/README.rst b/README.rst
@@ -20,7 +20,11 @@ Description
 
 A tool for estimating growth rates in growth curves. The tool fits λ ⋅ e :sup:`μ⋅x` + N :sub:`0` to any candidate growth phases of the growth curve that have increasing growth, i.e. where both the first and second derivative of the growth function are positive. To identify these phases reliably, the tool utilizes a custom smoothing function that addresses problems other smoothing methods have with growth curves that have regions with varying levels of noise (e.g. lots of noise in the beginning, then less noise after growth starts, then more noise in the stationary phase). 
 
-The parameter N :sub:`0` of the model can optionally be constrained. This is recommended if the value is known. The growth rate in calculated growth phases can only be properly compared if their N :sub:`0` (baseline OD; when the organism is at its initial population) points to a similar stage of actual growth.
+The parameter N :sub:`0` represents the background/blank OD reading (not seeding OD) and can optionally be constrained. This is recommended if the value is known.
+
+The growth rate in calculated growth phases can only be properly compared if their seeding OD (when the organism is at its initial population) points to a similar stage of actual growth.
+
+The intercept (λ) reported by this package can be used as an indicator of lag if the SNR is sufficiently high.
 
 Installation
 ============

diff --git a/croissance/__init__.py b/croissance/__init__.py
@@ -1,8 +1,13 @@
 from croissance.estimation import Estimator
+from collections import namedtuple
+
+AnnotatedGrowthCurve = namedtuple('AnnotatedGrowthCurve', ('series', 'outliers', 'growth_phases'))
 
 
 def process_curve(curve: 'pandas.Series', **kwargs):
     estimator = Estimator(**kwargs)
+    if curve.isnull().all():
+        return AnnotatedGrowthCurve(curve, [], [])
     return estimator.growth(curve)
 
 

diff --git a/croissance/__main__.py b/croissance/__main__.py
@@ -56,6 +56,7 @@ def main():
                           n0=args.N0)
 
     try:
+        empties = {}
         for infile in tqdm(args.infiles, unit='infile'):
             outfile = open('{}{}.tsv'.format(infile.name[:-4], args.output_suffix), 'w')
 
@@ -66,6 +67,12 @@ def main():
             outwriter = TSVWriter(outfile, include_default_phase=not args.output_exclude_default_phase)
 
             for name, curve in tqdm(list(reader.read(infile)), unit='curve'):
+                if curve.empty:
+                    try:
+                        empties[infile.name].append(name)
+                    except KeyError:
+                        empties[infile.name] = [name]
+                    continue
                 annotated_curve = estimator.growth(normalize_time_unit(curve, args.input_time_unit))
 
                 outwriter.write(name, annotated_curve)
@@ -80,7 +87,10 @@ def main():
     except KeyboardInterrupt:
         pass
 
-    print()
+    if empties != {}:
+        print('\nEmpty cells were found and discarded:\n', '\n'.join([(infile.name+'\t'+name) for key, names in empties.items() for name in names]))
+    else:
+        print()
 
 
 if __name__ == '__main__':

diff --git a/croissance/estimation/__init__.py b/croissance/estimation/__init__.py
@@ -115,6 +115,9 @@ def growth(self, curve: pandas.Series) -> AnnotatedGrowthCurve:
             smooth_series = segment_spline_smoothing(series, )
 
         phases = []
+        # give up if there isn't enough data
+        if len(smooth_series) < n_hours:
+            return AnnotatedGrowthCurve(series, [], [])
         raw_phases = self._find_growth_phases(smooth_series, window=n_hours)
 
         for phase in raw_phases:

diff --git a/croissance/estimation/regression.py b/croissance/estimation/regression.py
@@ -24,7 +24,12 @@ def exponential_constrain_n0(x, a, b):
         p0 = p0[:2]
 
     try:
-        popt, pcov = curve_fit(fit_fn, series.index, series.values, p0=p0, maxfev=10000)
+        popt, pcov = curve_fit(fit_fn,
+                               series.index,
+                               series.values,
+                               p0=p0,
+                               maxfev=10000,
+                               bounds=([0., 0., 0.], numpy.inf) if n0 is None else ([0., 0.], numpy.inf))
 
         if n0 is not None:
             popt = tuple(popt) + (n0,)
@@ -50,7 +55,12 @@ def exponential_constrain_n0(x, a, b):
         p0 = (numpy.exp(c), slope)
 
     try:
-        popt, pcov = curve_fit(fit_fn, series.index, series.values, p0=p0, maxfev=10000)
+        popt, pcov = curve_fit(fit_fn,
+                               series.index,
+                               series.values,
+                               p0=p0,
+                               maxfev=10000,
+                               bounds=([0., 0., 0.], numpy.inf) if n0 is None else ([0., 0.], numpy.inf))
 
         if n0 is not None:
             popt = tuple(popt) + (n0,)

diff --git a/croissance/estimation/smoothing/segments.py b/croissance/estimation/smoothing/segments.py
@@ -16,10 +16,11 @@ def segment_by_std_dev(series, increment=2, maximum=20):
     :param maximum:
     :return:
     """
+    start = int(series.index.min())
     duration = int(series.index[-2])
     windows = []
 
-    for i in range(0, duration, increment):
+    for i in range(start, duration, increment):
         for size in range(1, maximum + 1):
             window = detrend(series[i:i + size*increment])
             heappush(windows, (window.std() / (size*increment), i, i + size*increment))
@@ -48,7 +49,7 @@ def segment_by_std_dev(series, increment=2, maximum=20):
 def window_median(window, start, end):
     x = numpy.linspace(0, 1, num=len(window))
     A = numpy.vstack([x, numpy.ones(len(x))]).T
-    m, c = numpy.linalg.lstsq(A, window)[0]
+    m, c = numpy.linalg.lstsq(A, window, rcond=None)[0]
 
     return (start + end) / 2, m * 0.5 + numpy.median(window - m * x)
 
@@ -70,6 +71,9 @@ def segment_points(series, segments):
     for start, end in segments:
         window = series[start:end]
 
+        if window.empty:
+            continue
+
         if end - start > 5:
             out.append(window_median(series[start:start + 2], start, start + 2))
 
@@ -90,6 +94,6 @@ def segment_spline_smoothing(series, series_std_dev=None):
     if series_std_dev is None:
         series_std_dev = series
     segments = segment_by_std_dev(series_std_dev)
-    points = segment_points(series, segments)
+    points = segment_points(series, segments).sort_index()
     spline = InterpolatedUnivariateSpline(points.index, points.values, k=3)
     return pandas.Series(data=spline(series.index), index=series.index)
diff --git a/setup.py b/setup.py
@@ -39,7 +39,7 @@
     ],
     install_requires=[
         'numpy>=1.9.1',
-        'pandas>=0.15.2',
+        'pandas>=0.18.0',
         'scipy>=0.14.0',
         'matplotlib>=1.4.3',
         'tqdm>=4.11.2'