Prepare Q001, Q021, Q022, Q031, Q040

19cd7c1c · Dworatzyk, Katharina · ab4564b9 · 19cd7c1c · 19cd7c1c
Commit 19cd7c1c authored 3 years ago by Dworatzyk, Katharina
--- a/.gitignore
+++ b/.gitignore
+.idea/
+raw-data-hifis-survey-2021.csv
\ No newline at end of file
--- a/data-preparation.py
+++ b/data-preparation.py
+from typing import Set, Dict, List
+
+import pandas as pd
+
+def _load_data(path: str) -> pd.DataFrame:
+    """Read a LimeSurvey CSV export into a data frame indexed by "id".
+
+    The export is expected to be created with these LimeSurvey settings:
+        - CSV file format
+        - question and answer codes
+        - Expression Manager in use
+
+    The "id" column of the export becomes the index of the returned frame
+    and is no longer available as a regular column."""
+    raw_df: pd.DataFrame = pd.read_csv(path, sep=",")
+    # the LimeSurvey IDs identify the rows
+    return raw_df.set_index("id", drop=True)
+
+
+def _prepare_q001(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Categorize affiliation mentioned in free text answers keeping "Other"
+    as default category.
+
+    Free text answers in "Q001_other" that are listed in the mapping below
+    overwrite the answer code in "Q001". Rows whose free text is missing or
+    not listed keep their current "Q001" value. Afterwards "Q001_other" is
+    emptied for every row.
+
+    The frame is modified in place and returned."""
+    q001_oth_map: Dict[str, str] = {
+        "Baumanagement im DLR": "A005",
+        "DLR": "A005",
+        "Helmholtz Auslandbüro": "A011",
+        "HMC im GFZ": "A009"
+    }
+
+    # map() yields a missing value for every free text that is not a key of
+    # the mapping, so only recognised affiliations can overwrite "Q001"
+    recoded: pd.Series = data_df["Q001_other"].map(q001_oth_map)
+
+    # assign the columns on the frame itself: an in-place call on the result
+    # of an indexing operation is not guaranteed to reach the frame
+    data_df["Q001"] = recoded.combine_first(data_df["Q001"])
+    data_df["Q001_other"] = None
+    return data_df
+
+
+def _prepare_q010(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Placeholder for the preparation of Q010.
+
+    No preparation is implemented yet: the data is returned unchanged."""
+    return data_df
+
+
+def _prepare_q011(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Placeholder for the preparation of Q011.
+
+    No preparation is implemented yet: the data is returned unchanged."""
+    return data_df
+
+
+def _prepare_q013(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Placeholder for the preparation of Q013.
+
+    No preparation is implemented yet: the data is returned unchanged."""
+    return data_df
+
+
+def _prepare_q014(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Placeholder for the preparation of Q014.
+
+    No preparation is implemented yet: the data is returned unchanged."""
+    return data_df
+
+
+def _prepare_q021(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Categorize programming languages mentioned in free text field.
+    If programming language is mentioned less than 3 times assign "Other" as
+    default category.
+
+    The free text in "Q021_other" is split at commas into single mentions
+    which are compared case-insensitively and as whole mentions (no substring
+    or regular expression matching). Every language mentioned at least 3
+    times gets a new "Q021_SQxxx" column, inserted in front of "Q021_other"
+    and set to "Y" for each row that mentions it. New columns are created in
+    alphabetical order of the language so that the assignment of IDs is the
+    same on every run. Promoted mentions are removed from "Q021_other"; the
+    remaining mentions of a row are kept, joined by ", ". Finally, the known
+    remaining answers listed below are recoded.
+
+    The frame is modified in place and returned."""
+    other_column: str = "Q021_other"
+
+    # disentangle answers of free text field: one entry per mention, each
+    # still carrying the index label of the row it was given in
+    mentions: pd.Series = (
+        data_df[other_column]
+        .dropna()
+        .astype(str)
+        .str.split(",")
+        .explode()
+        .str.strip()
+    )
+    # "a,,b" or a trailing comma produce empty mentions which are no answers
+    mentions = mentions[(mentions != "").to_numpy()]
+    normalized: pd.Series = mentions.str.lower()
+
+    # new categories where answers are mentioned at least 3 times
+    counts: pd.Series = normalized.value_counts()
+    new_subquestion_labels: List[str] = sorted(counts[counts >= 3].index)
+
+    # question with sub-questions
+    old_subquestion_ids: List[str] = [column for column in data_df.columns
+                                      if column.startswith("Q021")]
+
+    # create ids for new categories
+    # NOTE(review): the numbering continues at the number of existing
+    # "Q021..." columns, i.e. it presumably relies on these being
+    # "Q021_SQ001" ... "Q021_SQnnn" plus "Q021_other" - confirm
+    first_new_number: int = len(old_subquestion_ids)
+    new_subquestion_ids: List[str] = [
+        "Q021_SQ" + str(number).zfill(3)
+        for number in range(first_new_number,
+                            first_new_number + len(new_subquestion_labels))
+    ]
+
+    # check new sub-question where answer option was given in free text field
+    before_other: int = data_df.columns.get_loc(other_column)
+    for offset, (new_subquestion_id, new_subquestion_label) in enumerate(
+            zip(new_subquestion_ids, new_subquestion_labels)):
+        data_df.insert(
+            loc=before_other + offset, column=new_subquestion_id, value=None
+        )  # new column for new variable
+        is_label = (normalized == new_subquestion_label).to_numpy()
+        rows_with_label = normalized.index[is_label].unique()  # find answers
+        data_df.loc[rows_with_label, new_subquestion_id] = "Y"  # answer code
+
+    # remove promoted mentions from Other category, keep the rest of the row
+    is_promoted = normalized.isin(new_subquestion_labels).to_numpy()
+    affected_rows = normalized.index[is_promoted].unique()
+    if len(affected_rows) > 0:
+        data_df.loc[affected_rows, other_column] = None
+        in_affected_row = mentions.index.isin(affected_rows)
+        leftover_mentions = mentions[~is_promoted & in_affected_row]
+        if not leftover_mentions.empty:
+            leftover_text = leftover_mentions.groupby(level=0).agg(", ".join)
+            data_df.loc[leftover_text.index, other_column] = leftover_text
+
+    # summarize remaining answers as "Other" category
+    other: Dict = {
+        "OPUS": "Y",
+        "Labtalk": "Y",
+        "Modelica": "Y",
+        "stan": "Y",
+        "AutoIt": "Y",
+        "REXX": "Y",
+        "Oracle*Forms": "Y",
+        "keine davon": None,
+        "Purescript": "Y",
+        "Scheme": "Y",
+        "SNL State Notation Language": "Y",
+        "Stata": "Y",
+        "html": "Y"
+    }
+    # assign the result on the frame itself: an in-place call on the result
+    # of an indexing operation is not guaranteed to reach the frame
+    data_df[other_column] = data_df[other_column].replace(to_replace=other)
+
+    return data_df
+
+
+def _prepare_q022(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Correct the LimeSurvey answer codes of Q022.
+
+    Every "SQ" in the answers of column "Q022" is rewritten to "A". The
+    frame is modified in place and returned."""
+    wrong_prefix, right_prefix = "SQ", "A"
+    corrected_codes = data_df.loc[:, "Q022"].str.replace(
+        pat=wrong_prefix, repl=right_prefix
+    )
+    data_df.loc[:, "Q022"] = corrected_codes
+    return data_df
+
+
+def _prepare_q031(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Mark "No answer" option as missing value.
+
+    The answer code "A000" in the columns "Q031_SQ001" and "Q031_SQ002" is
+    replaced by a missing value; all other answers and columns are left
+    untouched. The frame is modified in place and returned."""
+    q031_columns: List[str] = ["Q031_SQ001", "Q031_SQ002"]
+    # Use the dict form of replace(): with a scalar `to_replace` and
+    # `value=None`, older pandas versions fill the cell from the previous
+    # row instead of writing a missing value.
+    data_df.loc[:, q031_columns] = \
+        data_df.loc[:, q031_columns].replace(to_replace={"A000": None})
+    return data_df
+
+
+def _prepare_q040(data_df: pd.DataFrame) -> pd.DataFrame:
+    """Correct the LimeSurvey sub-question codes of Q040.
+
+    The column labels "Q040_SQ002" and "Q040_SQ003" are exchanged; the
+    column order and the values stay as they are. The frame is modified in
+    place and returned."""
+    first_code, second_code = "Q040_SQ002", "Q040_SQ003"
+    # both directions are needed so that the two labels trade places
+    swapped_codes = {first_code: second_code, second_code: first_code}
+    data_df.rename(columns=swapped_codes, inplace=True)
+    return data_df
+
+
+data: pd.DataFrame = _load_data(path="./raw-data-hifis-survey-2021.csv")
+# run the preparation steps one after another, each on the previous result
+for _prepare_step in (
+        _prepare_q001,
+        _prepare_q021,
+        _prepare_q022,
+        _prepare_q031,
+        _prepare_q040,
+):
+    data = _prepare_step(data_df=data)
+del _prepare_step  # leave only `data` behind in the module namespace