kr-colab · bruce-edelman · Jul 6, 2023 · Jul 6, 2023 · Jul 6, 2023 · Jul 6, 2023
diff --git a/.github/workflows/deploy.yaml → .github/workflows/deprecated/deploy.yaml b/.github/workflows/deploy.yaml → .github/workflows/deprecated/deploy.yaml
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml
@@ -0,0 +1,37 @@
+# This workflow installs diploSHIC with pip inside a conda environment, once for each Python version in the matrix
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Build
+
+on: push
+
+jobs:
+  Build_Package:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Setup conda
+      uses: s-weigand/setup-conda@v1
+      with:
+        activate-conda: true
+        update-conda: true
+        python-version: ${{ matrix.python-version }}
+        conda-channels: conda-forge
+    - name: Install dependencies 
+      run: |
+        conda install pip setuptools
+        pip install --upgrade pip
+    - name: Install diploSHIC
+      run: |
+        pip install .
+    - name: List installed
+      run: |
+        conda list
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -6,12 +6,18 @@
 # separate terms of service, privacy policy, and support
 # documentation.
 
-name: Upload Python Package
+name: Publish
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - main
+      - master
+    tags:
+      - v*
 
 jobs:
-  manylinux:
+  Build_Wheel:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+build
+diploSHIC.egg-info
+work
diff --git a/README.md b/README.md
@@ -1,3 +1,6 @@
+[![Build](https://github.com/kr-colab/diploSHIC/actions/workflows/python-build.yml/badge.svg)](https://github.com/kr-colab/diploSHIC/actions/workflows/python-build.yml)
+[![PyPI version](https://badge.fury.io/py/diploSHIC.svg)](https://badge.fury.io/py/diploSHIC)
+
 # diploS/HIC
 This repo contains the implementation for `diploS/HIC` as described in Kern and Schrider (2018; https://doi.org/10.1534/g3.118.200262), along 
 with its associated support scripts. `diploS/HIC` uses a deep convolutional neural network to identify

diff --git a/diploshic/__init__.py b/diploshic/__init__.py
@@ -1,3 +1,7 @@
 from diploshic.fvTools import *
 from diploshic.msTools import *
 from diploshic.shicstats import *
+from . import network
+from . import dataloader
+from . import misc
+from . import parser
diff --git a/diploshic/dataloader.py b/diploshic/dataloader.py
@@ -0,0 +1,101 @@
+from keras.utils import Sequence
+import numpy as np
+import gc
+
+
+def load_fvecs_from_directory(directory, n_subwin=11):
+  """Load the five training feature-vector files together with class labels.
+
+  Reads ``hard.fvec``, ``neut.fvec``, ``soft.fvec``, ``linkedSoft.fvec`` and
+  ``linkedHard.fvec`` from ``directory``, skipping the first line of each file.
+
+  Args:
+    directory: folder that holds the ``.fvec`` files. A trailing path
+      separator is optional; the file names are joined with ``os.path.join``.
+    n_subwin: number of sub-windows each row is split into.
+
+  Returns:
+    A tuple ``(X, y)``. ``X`` has shape ``(n_rows, nDims, n_subwin, 1)`` where
+    ``nDims`` is the column count of ``hard.fvec`` divided by ``n_subwin``.
+    ``y`` holds one integer label per row: 0 = hard, 1 = neut, 2 = soft,
+    3 = linkedSoft, 4 = linkedHard. Rows appear in that same file order.
+
+  Raises:
+    ValueError: from the reshape, if a file's column count does not match
+      ``nDims * n_subwin``.
+  """
+  import os  # local import: keeps this block independent of the module header
+
+  # The position of each file in this tuple is its class label.
+  class_files = (
+    "hard.fvec",
+    "neut.fvec",
+    "soft.fvec",
+    "linkedSoft.fvec",
+    "linkedHard.fvec",
+  )
+  # ndmin=2 keeps a file with a single data row two-dimensional, so
+  # ``shape[1]`` below is always available.
+  tables = [
+    np.loadtxt(os.path.join(directory, name), skiprows=1, ndmin=2)
+    for name in class_files
+  ]
+  nDims = int(tables[0].shape[1] / n_subwin)
+  stacks = [np.reshape(t, (t.shape[0], nDims, n_subwin)) for t in tables]
+  both = np.concatenate(stacks)
+  y = np.repeat(np.arange(len(stacks)), [len(s) for s in stacks])
+  return both.reshape(both.shape[0], nDims, n_subwin, 1), y
+
+
+def load_empirical_fvecs_from_directory(directory, n_subwin=11):
+  """Load ``empirical.fvec`` from ``directory`` as a 4-D array.
+
+  Args:
+    directory: folder that holds ``empirical.fvec``. A trailing path
+      separator is optional; the file name is joined with ``os.path.join``.
+    n_subwin: number of sub-windows each row is split into.
+
+  Returns:
+    Array of shape ``(n_rows, nDims, n_subwin, 1)`` where ``nDims`` is the
+    file's column count divided by ``n_subwin``. The first line of the file
+    is skipped.
+
+  Raises:
+    ValueError: from the reshape, if the column count is not
+      ``nDims * n_subwin``.
+  """
+  import os  # local import: keeps this block independent of the module header
+
+  # Load first: the column count is needed before nDims can be derived.
+  # ndmin=2 keeps a file with a single data row two-dimensional.
+  emp = np.loadtxt(os.path.join(directory, "empirical.fvec"), skiprows=1, ndmin=2)
+  nDims = int(emp.shape[1] / n_subwin)
+  emp = np.reshape(emp, (emp.shape[0], nDims, n_subwin))
+  # Append a trailing axis of length 1.
+  return emp.reshape(emp.shape[0], nDims, n_subwin, 1)
+
+
+class DADiploSHICDataLoader(Sequence):
+  """Batch generator that mixes labelled source rows with discriminator rows.
+
+  Each batch stacks three slices in this order: ``batch_size`` source rows that
+  carry their ``Y_pred`` labels, ``batch_size // 2`` source rows with
+  discriminator label 0, and ``batch_size // 2`` target rows with
+  discriminator label 1. Entries that do not belong to an output are filled
+  with -1 (presumably a value the losses ignore; confirm against the model).
+  """
+
+  def __init__(self, X_src, X_tgt, Y_pred, batch_size):
+    """Store the arrays and draw the initial shuffled index orders."""
+    self.src_data = X_src
+    self.tgt_data = X_tgt
+    self.y_pred = Y_pred
+    self.batch_size = batch_size
+
+    n_src = self.src_data.shape[0]
+    n_tgt = self.tgt_data.shape[0]
+
+    # model sees training sample at most once per epoch
+    n_usable = np.minimum(n_src, n_tgt)
+    self.no_batch = int(np.floor(n_usable / self.batch_size))
+
+    self.src_pred_idx = np.arange(n_src)
+    self.src_discr_idx = np.arange(n_src)
+    self.tgt_discr_idx = np.arange(n_tgt)
+
+    # Shuffle order is kept fixed so seeded runs stay reproducible.
+    for order in (self.src_pred_idx, self.src_discr_idx, self.tgt_discr_idx):
+      np.random.shuffle(order)
+
+  def __len__(self):
+    """Return the number of batches per epoch."""
+    return self.no_batch
+
+  def on_epoch_end(self):
+    """Reshuffle all three index orders and run the garbage collector."""
+    for order in (self.src_pred_idx, self.src_discr_idx, self.tgt_discr_idx):
+      np.random.shuffle(order)
+    gc.collect()
+
+  def __getitem__(self, idx):
+    """Build batch ``idx`` as ``(X, {"predictor": ..., "discriminator": ...})``."""
+    full = self.batch_size
+    half = self.batch_size // 2
+    pred_rows = self.src_pred_idx[idx * full:(idx + 1) * full]
+    src_rows = self.src_discr_idx[idx * half:(idx + 1) * half]
+    tgt_rows = self.tgt_discr_idx[idx * half:(idx + 1) * half]
+
+    batch_X = np.concatenate((
+      self.src_data[pred_rows],
+      self.src_data[src_rows],
+      self.tgt_data[tgt_rows],
+    ))
+
+    # Predictor labels: real labels for the first slice, -1 for the rest.
+    pred_labels = self.y_pred[pred_rows]
+    n_outputs = self.y_pred.shape[1]
+    batch_Y_pred = np.concatenate((
+      pred_labels,
+      np.full((len(src_rows), n_outputs), -1.0),
+      np.full((len(tgt_rows), n_outputs), -1.0),
+    ))
+
+    # Discriminator labels: -1 for the first slice, 0 = source, 1 = target.
+    batch_Y_discr = np.concatenate((
+      np.full(len(pred_rows), -1.0),
+      np.zeros(len(src_rows)),
+      np.ones(len(tgt_rows)),
+    ))
+
+    assert batch_X.shape[0] == self.batch_size*2, (batch_X.shape, self.batch_size*2)
+    assert batch_Y_pred.shape[0] == batch_Y_discr.shape[0], (batch_Y_pred.shape, batch_Y_discr.shape)
+    return batch_X, {"predictor": batch_Y_pred, "discriminator": batch_Y_discr}
+
+
+class DiploSHICDataLoader(Sequence):
+  """Batch generator that serves shuffled ``(X, Y)`` slices of one dataset.
+
+  Rows that do not fill a whole batch are left out of the epoch, because the
+  batch count is the row count divided by ``batch_size``, rounded down.
+  """
+
+  def __init__(self, X_src, Y_pred, batch_size):
+    """Store the arrays and draw the initial shuffled row order."""
+    self.data = X_src
+    self.y_pred = Y_pred
+    self.batch_size = batch_size
+
+    n_rows = self.data.shape[0]
+    self.no_batch = int(np.floor(n_rows / self.batch_size))
+
+    self.pred_idx = np.arange(n_rows)
+    np.random.shuffle(self.pred_idx)
+
+  def __len__(self):
+    """Return the number of batches per epoch."""
+    return self.no_batch
+
+  def on_epoch_end(self):
+    """Reshuffle the row order and run the garbage collector."""
+    np.random.shuffle(self.pred_idx)
+    gc.collect()
+
+  def __getitem__(self, idx):
+    """Return batch ``idx`` as a ``(batch_X, batch_Y_pred)`` pair."""
+    start = idx * self.batch_size
+    stop = (idx + 1) * self.batch_size
+    rows = self.pred_idx[start:stop]
+    return self.data[rows], self.y_pred[rows]