Skip to content
Snippets Groups Projects
Commit 19cd7c1c authored by Dworatzyk, Katharina's avatar Dworatzyk, Katharina
Browse files

Prepare Q001, Q021, Q022, Q031, Q040

parent ab4564b9
No related branches found
No related tags found
1 merge request: !7 Resolve "Screen free text answers"
.idea/
raw-data-hifis-survey-2021.csv
\ No newline at end of file
from typing import Set, Dict, List
import pandas as pd
def _load_data(path: str) -> pd.DataFrame:
"""Load data from LimeSurvey export with the following settings:
- as CSV file
- question and answer codes
- use Expression Manager"""
data_df: pd.DataFrame = pd.read_csv(filepath_or_buffer=path, sep=",")
# set LimeSurvey IDs as index
data_df.set_index(keys="id", drop=True, inplace=True)
return data_df
def _prepare_q001(data_df: pd.DataFrame) -> pd.DataFrame:
"""Categorize affiliation mentioned in free text answers keeping "Other"
as default category."""
q001_oth_map: Dict = {
"Baumanagement im DLR": "A005",
"DLR": "A005",
"Helmholtz Auslandbüro": "A011",
"HMC im GFZ": "A009"
}
data_df.loc[:, "Q001_other"].replace(to_replace=q001_oth_map, inplace=True)
data_df.loc[:, "Q001"].update(data_df.loc[:, "Q001_other"])
data_df.loc[:, "Q001_other"] = None
return data_df
def _prepare_q010(data_df: pd.DataFrame) -> pd.DataFrame:
return data_df
def _prepare_q011(data_df: pd.DataFrame) -> pd.DataFrame:
return data_df
def _prepare_q013(data_df: pd.DataFrame) -> pd.DataFrame:
return data_df
def _prepare_q014(data_df: pd.DataFrame) -> pd.DataFrame:
return data_df
def _prepare_q021(data_df: pd.DataFrame) -> pd.DataFrame:
"""Categorize programming languages mentioned in free text field.
If programming language is mentioned less than 3 times assign "Other" as
default category."""
# disentangle answers of free text field
free_text_answers = data_df.loc[:, "Q021_other"] \
.dropna() \
.str.lower() \
.str.split(",") \
.explode("Q021_other") \
.str.strip()
# unique values
free_text_answer_options: Set = set(free_text_answers)
# new categories where answers are mentioned at least 3 times
new_subquestion_labels: List = [free_text_answer_option for
free_text_answer_option in
free_text_answer_options if
free_text_answers.isin(
[free_text_answer_option]).sum() >= 3]
# question with sub-questions
old_subquestion_ids: List = [column for column in data_df.columns if
column.startswith("Q021")]
# create ids for new categories
new_subquestion_ids: List = ["Q021_SQ" + str(i).zfill(3) for i in
range(len(old_subquestion_ids),
len(old_subquestion_ids) + len(
new_subquestion_labels))]
# check new sub-question where answer option was given in free text field
before_other = data_df.columns.get_loc("Q021_other")
for (new_subquestion_id, new_subquestion_label) in dict(
zip(new_subquestion_ids, new_subquestion_labels)).items():
data_df.insert(
loc=before_other, column=new_subquestion_id, value=None
) # new column for new variable
mask = data_df.loc[:, "Q021_other"].str.contains(
new_subquestion_label, case=False, na=False
) # find answers
data_df.loc[mask, new_subquestion_id] = "Y" # answer code
data_df.loc[mask, "Q021_other"] = None # remove from Other category
before_other = before_other + 1
# summarize remaining answers as "Other" category
other: Dict = {
"OPUS": "Y",
"Labtalk": "Y",
"Modelica": "Y",
"stan": "Y",
"AutoIt": "Y",
"REXX": "Y",
"Oracle*Forms": "Y",
"keine davon": None,
"Purescript": "Y",
"Scheme": "Y",
"SNL State Notation Language": "Y",
"Stata": "Y",
"html": "Y"
}
data_df.loc[:, "Q021_other"].replace(to_replace=other, inplace=True)
return data_df
def _prepare_q022(data_df: pd.DataFrame) -> pd.DataFrame:
"""Fix incorrect LimeSurvey answer codes."""
data_df.loc[:, "Q022"] = data_df.loc[:, "Q022"].str.replace(
pat="SQ",
repl="A"
)
return data_df
def _prepare_q031(data_df: pd.DataFrame) -> pd.DataFrame:
"""Mark "No answer" option as missing value."""
data_df.loc[:, ["Q031_SQ001", "Q031_SQ002"]] = \
data_df.loc[:, ["Q031_SQ001", "Q031_SQ002"]].replace(
to_replace="A000",
value=None
)
return data_df
def _prepare_q040(data_df: pd.DataFrame) -> pd.DataFrame:
"""Fix incorrect LimeSurvey sub-question codes."""
data_df.rename(
columns={"Q040_SQ002": "Q040_SQ003", "Q040_SQ003": "Q040_SQ002"},
inplace=True
)
return data_df
# Load the raw LimeSurvey export and apply the preparation steps in order.
data: pd.DataFrame = _load_data(path="./raw-data-hifis-survey-2021.csv")
for _prepare_step in (
    _prepare_q001,
    _prepare_q021,
    _prepare_q022,
    _prepare_q031,
    _prepare_q040,
):
    data = _prepare_step(data_df=data)
# Keep the module namespace free of the loop helper.
del _prepare_step
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment