misc progress

remrama · Jun 4, 2024 · 812b5cb · 812b5cb
1 parent 2bad1bc
commit 812b5cb
Show file tree

Hide file tree

Showing 7 changed files with 129 additions and 24 deletions.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -1,10 +1,11 @@
+# Build and Deploy Sphinx Docs to GitHub Pages
 # Workflow for building sphinx documentation remotely
 # and deploying the static HTML to a GitHub Pages site.
 # Note this approach does not require the static HTML
 # files to be stored/pushed/committed to an alternate
 # pages branch or anything like that. It uses GitHub
 # Action artifacts...
-name: Build and Deploy Sphinx Docs to GitHub Pages
+name: Deploy Docs
 
 
 on:

diff --git a/README.rst b/README.rst
@@ -1,9 +1,15 @@
 
 .. image:: https://badge.fury.io/py/krank.svg
    :target: https://badge.fury.io/py/krank
+   :alt: PyPI
 
 .. image:: https://img.shields.io/badge/code%20style-black-000000.svg
    :target: https://github.com/psf/black
+   :alt: Black
+
+.. image:: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json
+   :target: https://github.com/astral-sh/ruff
+   :alt: Ruff
 
 ----
 
@@ -64,12 +70,3 @@ An identical way to fetch the same file is through krank's top-level interface,
    import krank
 
    df = krank.fetch_lexicon("threat")
-
-
-
-Contributing
-------------
-
-Currently, Krank is developed mostly for my own personal desires for more convenient data access, more convenient data storage, a better ability to see a high-level view of all the datasets available for my projects, and better version-control over the public datasets I use and create. Making a Python package with documentation to view the datasets seemed like the best way to do this.
-
-That being said, if anyone else finds it useful and wants to contribute, please do. Feel free to post questions, bugs, feature requests, new dataset proposals, or even new module proposals on the `Krank GitHub Issues page <https://github.com/remrama/krank/issues>`_.
diff --git a/docs/api.rst b/docs/api.rst
@@ -34,8 +34,10 @@ LIWC
    :toctree: generated/
    :nosignatures:
 
+   krank.liwc.fetch_bainbridge2023
    krank.liwc.fetch_barrett2020
    krank.liwc.fetch_cariola2010
+   krank.liwc.fetch_pearson2023
 
 
 Tables

diff --git a/docs/conf.py b/docs/conf.py
@@ -132,7 +132,7 @@
     # "sidebarwidth": 230,
     # "navbar_start": ["navbar-logo", "version-switcher"],
     "show_version_warning_banner": True,
-    "announcement": "BEWARE! This project is in the <a href='https://github.com/remrama/krank'>planning stage</a>.",
+    "announcement": "BEWARE! <a href='https://github.com/remrama/krank'>This project</a> is in the planning stage. DO NOT USE!",
     "navbar_align": "left",  # [left, content, right] For testing that the navbar items align properly
     "show_nav_level": 3,
     "show_toc_level": 3,

diff --git a/docs/index.rst b/docs/index.rst
@@ -1,9 +1,8 @@
 
 .. image:: https://badge.fury.io/py/krank.svg
    :target: https://badge.fury.io/py/krank
+   :alt: PyPI
 
-.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
-   :target: https://github.com/psf/black
 
 ----
 
@@ -18,11 +17,19 @@ Fetch psychology datasets from remote sources.
 
 See Krank's :doc:`api` for a complete list of currently available functions.
 
+Feel free to post questions, bugs, feature requests, new dataset proposals, or even new module proposals on the `Krank GitHub Issues page <https://github.com/remrama/krank/issues>`_. See :doc:`contributing` for more details.
 
-.. warning::
 
-   This project is in the planning stage of development. Don't use it.
+.. danger::
 
+   This project is in the planning stage of development. DO NOT USE!
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents
+
+   api.rst
 
 
 Installation
@@ -64,12 +71,3 @@ An identical way to fetch the same file is through krank's top-level interface,
    import krank
 
    df = krank.fetch_lexicon("threat")
-
-
-
-Contributing
-------------
-
-Currently, Krank is developed mostly for my own personal desires for more convenient data access, more convenient data storage, a better ability to see a high-level view of all the datasets available for my projects, and better version-control over the public datasets I use and create. Making a Python package with documentation to view the datasets seemed like the best way to do this.
-
-That being said, if anyone else finds it useful and wants to contribute, please do. Feel free to post questions, bugs, feature requests, new dataset proposals, or even new module proposals on the `Krank GitHub Issues page <https://github.com/remrama/krank/issues>`_.
diff --git a/src/krank/data/liwc.json b/src/krank/data/liwc.json
@@ -0,0 +1,26 @@
+{
+    "andrewshanna2022": {
+        "v0": {
+            "url": "https://osf.io/download/z8ha4?view_only=449d1ed6f90e4651badc0b47a9302ae5",
+            "md5": "39023fe7f50c429d5273c63eb9c92f3f"
+        }
+    },
+    "bainbridge2023": {
+        "v1": {
+            "study1": {
+                "url": "https://raw.githubusercontent.com/conbainbridge/covid_thoughts/CT_public_repo_1.0/data/study1_LIWC_all.csv",
+                "md5": "76b8ac3eafa03d5f73b932ed4a683cf0"
+            },
+            "study2": {
+                "url": "https://raw.githubusercontent.com/conbainbridge/covid_thoughts/CT_public_repo_1.0/data/study2_LIWC_all.csv",
+                "md5": "e923075330339f49ecb7bea48cab43f1"
+            }
+        }
+    },
+    "pearson2023": {
+        "v0": {
+            "url": "https://osf.io/download/u6kdv",
+            "md5": "f5a8aa1526bb2a4d4374ec03ab3284b3"
+        }
+    }
+}
diff --git a/src/krank/liwc.py b/src/krank/liwc.py
@@ -23,6 +23,63 @@
 # Specific fetching functions
 ################################################################################
 
+# def fetch_andrewshanna2022(version=None, load=True, target_dic=None, **kwargs):
+#     """
+#     Andrews-Hanna, Woo, Wilcox, Eisenbarth, Kim, Han, Losin, and Wager, 2022, *Journal of Experimental Psychology: General*,
+#     The conceptual building blocks of everyday thought: Tracking the emergence and dynamics of ruminative and nonruminative thinking,
+#     doi:`10.1037/xge0001096 <https://doi.org/10.1037/xge0001096>`_
+
+#     * **Source repository:** `OSF <https://osf.io/j5vn2/?view_only=449d1ed6f90e4651badc0b47a9302ae5>`_
+#     * **Source file:** `Andrews-Hanna_Woo_et_al_manuscript_data.csv <https://osf.io/z8ha4?view_only=449d1ed6f90e4651badc0b47a9302ae5>`_
+#     * **Reference:** `10.1037/xge0001096 <https://doi.org/10.1037/xge0001096>`_
+#     """
+#     def _load(filepath):
+#         conditions = ["Thoughts", "FAST"]
+#         df = (
+#             pd.read_csv(filepath).dropna(axis=1, how="all")  # last col `Unnamed: 95` and empty
+#         # meta_cols = ['Gender', 'Ethnicity_A', 'Age']
+#             pd.wide_to_long(df, conditions, j="LIWC", i="Subject_ID", sep="_LIWC_", suffix=r"\w+")
+#             .groupby("LIWC")[["Thoughts", "FAST"]].mean().T.rename_axis("condition").rename_axis(None, axis=1)
+#         )
+#         df.info()
+#         df.describe()
+#         # pd.wide_to_long(
+#         #     df.set_index("Subject_ID").filter(like="LIWC").reset_index(),
+#         #     ["Thoughts", "FAST"], j="LIWC", i="Subject_ID", sep="_LIWC_", suffix=r"\w+",
+#         # ).rename_axis("condition", axis=1).stack().unstack("LIWC").groupby("condition").mean()
+#     fp = retrieve(name, **kwargs)
+#     if not load:
+#         return fp
+#     return load(df) if callable(load) else _load(df)
+
+def fetch_bainbridge2023(version=None, load=True, target_dic=None, **kwargs):
+    """
+    LIWC scores from Bainbridge & Dale, 2023, *PLOS One*,
+    Thinking about life in COVID-19: An exploratory study on the influence of temporal framing on streams-of-consciousness,
+    doi:`10.1371/journal.pone.0285200 <https://doi.org/10.1371/journal.pone.0285200>`_

+    Subject-level LIWC scores are available from both studies in a
+    `GitHub repository <https://github.com/conbainbridge/covid_thoughts>`_.

+    Version control comes from GitHub Releases.
+    Version 1.0 is tagged ``CT_public_repo_1.0``
+    (`Release <https://github.com/conbainbridge/covid_thoughts/releases/tag/CT_public_repo_1.0>`_,
+    `Repo <https://github.com/conbainbridge/covid_thoughts/tree/CT_public_repo_1.0>`_).

+    ``load`` controls what is returned. If it is falsy, the two retrieved
+    file paths are returned as a tuple ``(study1, study2)`` without being read.
+    If it is callable, it is applied to each file path in place of the default
+    loader. Otherwise the default loader is used. The loaded results of both
+    studies are concatenated row-wise and sorted by index.

+    ``version`` and ``target_dic`` are accepted but not used by this function.
+    ``**kwargs`` is forwarded to ``pooch.retrieve`` for both files.
+    """
+    # NOTE(review): only the bare study name is passed positionally and
+    # ``version`` is not forwarded -- confirm against the retrieve API in use.
+    fp1 = pooch.retrieve("study1", **kwargs)
+    fp2 = pooch.retrieve("study2", **kwargs)
+    if not load:
+        return fp1, fp2
+    def _load(fp):
+        """Default loader applied to each study file."""
+        # Drop the "Segment" column; the first remaining column becomes the index.
+        df = pd.read_csv(fp, index_col=0, usecols=lambda c: c != "Segment").astype("float")
+        # Split row labels shaped like ``<3 digits>_<4 chars>_<rest>.txt`` into
+        # named index levels.
+        index = df.index.str.extract(r"(?P<subject>\d{3})_(?P<cond1>\w{4})_(?P<cond2>\w+)\.txt")
+        # The study label is inferred from the file path itself.
+        index.insert(0, "study", "study1" if "study1" in fp else "study2")
+        df.index = pd.MultiIndex.from_frame(index)
+        return df
+    # Use the caller's loader when one is supplied, otherwise the default.
+    load_func = load if callable(load) else _load
+    df = pd.concat([load_func(fp1), load_func(fp2)], axis=0, sort=True).sort_index(axis=0)
+    return df
 
 def fetch_barrett2020(version=None, load=True, target_dic=None, **kwargs):
     """
@@ -158,6 +215,30 @@ def fetch_cariola2010(version=None, load=True, target_dic=None, **kwargs):
     df = sanitize_dataframe(df)
     return df
 
+
+def fetch_pearson2023(version=None, load=True, target_dic=None, **kwargs):
+    """
+    Pearson, Graff, Bai, Jakubowski, and Belfi, 2023, *Memory*,
+    Differences in autobiographical memories reported using text and voice during everyday life,
+    doi:`10.1080/09658211.2022.2162084 <https://doi.org/10.1080/09658211.2022.2162084>`_

+    * **Source repository:** `OSF <https://osf.io/2ykx5>`_
+    * **Source file:** `all_data.xlsx <https://osf.io/u6kdv>`_
+    * **Source citation:** `10.1080/09658211.2022.2162084 <https://doi.org/10.1080/09658211.2022.2162084>`_

+    ``load`` controls what is returned. If it is falsy, the retrieved file
+    path is returned without being read. If it is callable, the result of
+    ``load(filepath)`` is returned. Otherwise the selected LIWC columns are
+    read from the spreadsheet and averaged within each ``MemNum``/``Group``
+    combination, with rows and columns sorted.

+    ``target_dic`` is accepted but not used by this function. ``version`` and
+    ``**kwargs`` are forwarded to ``pooch.retrieve``.
+    """
+    # NOTE(review): confirm that the retrieve API in use accepts ``version``.
+    fp = pooch.retrieve("pearson2023", version=version, **kwargs)
+    if not load:
+        return fp
+    if callable(load):
+        return load(fp)
+    meta_cols = ["MemNum", "Group"]
+    liwc_cols = ["WC", "affect", "social", "cogproc", "percept"]
+    # Read only the grouping and LIWC columns, then average per group.
+    df = pd.read_excel(fp, usecols=meta_cols + liwc_cols)
+    df = df.groupby(meta_cols).mean().astype("float").sort_index(axis=0).sort_index(axis=1)
+    return df
+
 # def fetch_hawkins2017(, load=True):
 
 # def load_hawkins():