From fdf438712f976c9ca7c1659166e2ccf77150f97e Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 24 Nov 2021 16:39:11 +0100
Subject: [PATCH 01/13] Re-add services script after merge

---
 scripts/hifis-software-services.py | 197 +++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 scripts/hifis-software-services.py

diff --git a/scripts/hifis-software-services.py b/scripts/hifis-software-services.py
new file mode 100644
index 00000000..ae3d9bd3
--- /dev/null
+++ b/scripts/hifis-software-services.py
@@ -0,0 +1,197 @@
+from typing import List
+
+import modules.descriptive_stats as stats
+import pandas as pd
+from hifis_surveyval.data_container import DataContainer
+from hifis_surveyval.hifis_surveyval import HIFISSurveyval
+
+
+def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
+    """Execute hifis software services script."""
+
+    def run_analysis(data_set: pd.DataFrame, id_list: List,
+                     tbl: pd.DataFrame, compact_format) -> pd.DataFrame:
+        """Compute descriptive statistics for different scale levels and
+        response formats and compile a table containing these statistics."""
+        # for each specified item
+        for q_id in id_list:
+            # if data are categorical and answers were given as single choice
+            if (q_id == "Q005" or q_id == "Q006" or q_id == "Q007-T"
+                    or q_id == "Q035"):
+                # compute relative frequencies for question
+                [n, stats_tbl, unit] = stats.nominal_single_choice(
+                    variable=q_id[:4] + "/_",
+                    meta=data,
+                    data=data_set[(q_id, "_")]
+                )
+            # if data are categorical and answers were given as multiple choice
+            elif q_id == "Q037":
+                # compute relative frequencies for question collection
+                [n, stats_tbl, unit] = stats.nominal_multiple_choice(
+                    variable=q_id,
+                    meta=data,
+                    data=data_set[q_id]
+                )
+            # if data are categorical and answers were given as single choice
+            # for multiple questions (array)
+            elif q_id == "Q036":
+                # Compute relative frequencies for question collection
+                [n, stats_tbl, unit] = stats.nominal_array(
+                    variable=q_id,
+                    meta=data,
+                    data=data_set[q_id]
+                )
+            # if data are numerical and answers were given as single choice
+            elif q_id == "Q007":
+                # Compute basic descriptive statistics for question
+                [n, stats_tbl, unit] = stats.scale_single_choice(
+                    variable=q_id[:4] + "/_",
+                    meta=data,
+                    data=data_set[(q_id, "_")],
+                    custom_stats=["mean", "std"]
+                )
+            else:
+                print("You did not specify any analysis for this item: "
+                      + q_id)
+                continue
+
+            # add individual item statistics to statistics table
+            # (vertical order)
+            tbl = stats.add_to_table(
+                tbl=tbl,
+                variable=q_id[:4],
+                meta=data,
+                count=n,
+                stats=stats_tbl,
+                unit=unit,
+                compact=compact_format
+            )
+
+        return tbl
+
+    # output path
+    path = str(hifis_surveyval.settings.ANALYSIS_OUTPUT_PATH) + "\\"
+
+    # items relevant to describe aspects of software development at HGF
+    # and needs regarding software development support offerings
+    ids: List = ["Q006", "Q005", "Q007", "Q035", "Q036", "Q037"]
+
+    # define Helmholtz centre as grouping variable
+    grouping_variable: str = "Q001/_"
+
+    # load data for the specified questions
+    software_services_data: pd.DataFrame = data.data_frame_for_ids(
+        ids + [grouping_variable]
+    )
+
+    # transform interval variable Q007 into categorical variable using
+    # quantiles for binning
+    software_services_data.loc[:, "Q007-T/_"] = pd.cut(
+        x=software_services_data["Q007/_"],
+        bins=[0, 25, 50, 75, 100],
+        right=True,
+        labels=["0-25%", "26-50%", "51-75%", "76-100%"],
+        include_lowest=True
+    )
+
+    # add transformed variable to list of questions
+    ids = ids + ["Q007-T"]
+    ids.sort()
+
+    #TODO: Fix order and replace Q006/A006 with missing value
+
+    # set multi-index
+    software_services_data.columns = \
+        software_services_data.columns.str.split('/', expand=True)
+
+    # create statistics table containing total number of cases
+    software_services_tbl: pd.DataFrame = \
+        stats.get_sample_size(software_services_data)
+
+    # compute descriptive statistics
+    software_services_tbl = run_analysis(
+        data_set=software_services_data,
+        id_list=ids,
+        tbl=software_services_tbl,
+        compact_format=False
+    )
+
+    # list all Helmholtz centres (not including Others)
+    groups: List = [
+        answer_option.label for answer_option in
+        data.question_for_id(grouping_variable)._answer_options.values()
+    ][:-1]
+
+    # create new table for grouped sample characteristics
+    grouped_software_services_tbl: pd.DataFrame = pd.DataFrame()
+
+    # for each Helmholtz centre
+    for group in groups:
+        # get the centre-specific data set
+        group_data: pd.DataFrame = software_services_data[
+            software_services_data[grouping_variable[:4]].values == group
+            ]
+        # compute descriptive statistics for each centre separately
+        centre_software_services_tbl: pd.DataFrame = run_analysis(
+            data_set=group_data,
+            id_list=ids,
+            tbl=pd.DataFrame(columns=[""]),
+            compact_format=True
+        )
+        # set centre as header
+        centre_software_services_tbl.columns = [group]
+        # add centre-specific statistics to grouped sample table
+        # accounting for identical indices by resetting index of individual tbl
+        index: pd.Index = centre_software_services_tbl.index
+        grouped_software_services_tbl = grouped_software_services_tbl.join(
+            other=centre_software_services_tbl.reset_index(drop=True),
+            how="right",
+            sort=False
+        )
+    grouped_software_services_tbl.index = index
+
+    # display results
+    print("\n<< -- HIFIS software services -- >>")
+    print("\nTotal sample: \n", software_services_tbl)
+    print("\nGrouped per Helmholtz centre: \n", grouped_software_services_tbl)
+
+    # export results
+    software_services_tbl.to_latex(
+        buf=path + "software-services.tex",
+        header=True,
+        index=True,
+        na_rep="",
+        float_format="{:0.2f}".format,
+        column_format="@{}p{0.9\\textwidth}r@{}r@{}",
+        caption="Software services: status quo",
+        label="tab:software_services_statistics",
+        position="htbp",
+        longtable=True
+    )
+
+    grouped_software_services_tbl.to_latex(
+        buf=path + "software-services-grouped.tex",
+        header=True,
+        index=True,
+        na_rep="",
+        float_format="{:0.0f}".format,
+        column_format="@{}p{0.25\\textwidth}rrrrrrrrrrrrrrrrrrr@{}",
+        caption="Software services per Helmholtz centre",
+        label="tab:software_services_statistics_grouped",
+        position="htbp",
+        longtable=True
+    )
+
+    # generate grouped box plot for interval variable(s)
+    stats.grouped_box_plot(
+        variable="Q007",
+        g_variable=grouping_variable[:4],
+        meta=data,
+        data=software_services_data[[
+            grouping_variable[:4], "Q007"
+        ]].droplevel(axis=1, level=1),
+        unit="\\%",
+        path=path
+    )
+
+    # TODO: Add plots
\ No newline at end of file
-- 
GitLab


From 008196415c796176d8effc8f014f12a5386d25a3 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 24 Nov 2021 16:48:07 +0100
Subject: [PATCH 02/13] Integrate updates from community branch

---
 scripts/modules/descriptive_stats.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/modules/descriptive_stats.py b/scripts/modules/descriptive_stats.py
index cf4a5efe..a1a6fe30 100644
--- a/scripts/modules/descriptive_stats.py
+++ b/scripts/modules/descriptive_stats.py
@@ -5,10 +5,11 @@ import matplotlib.pyplot as plt
 from hifis_surveyval.core import util
 from hifis_surveyval.data_container import Question, QuestionCollection
 from hifis_surveyval.hifis_surveyval import HIFISSurveyval
-from numpy import array
+from numpy import nan, array
 from pandas import DataFrame, Series, set_option
 from wordcloud import WordCloud
 
+
 # Settings for proper displaying (e.g. question text will be cut off otherwise)
 set_option("display.max_columns", None)
 set_option("display.max_rows", None)
-- 
GitLab


From 29e34b723c67df61fec264bf367545046cbff2bb Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Mon, 29 Nov 2021 14:20:20 +0100
Subject: [PATCH 03/13] Re-write hifis services script

---
 scripts/hifis-software-services.py   | 295 +++++++++++----------------
 scripts/modules/descriptive_stats.py |   8 +
 2 files changed, 127 insertions(+), 176 deletions(-)

diff --git a/scripts/hifis-software-services.py b/scripts/hifis-software-services.py
index ae3d9bd3..6f7ff66e 100644
--- a/scripts/hifis-software-services.py
+++ b/scripts/hifis-software-services.py
@@ -1,197 +1,140 @@
-from typing import List
+from typing import Dict, Set
 
 import modules.descriptive_stats as stats
-import pandas as pd
-from hifis_surveyval.data_container import DataContainer
+from pandas import DataFrame
+from hifis_surveyval.data_container import DataContainer, Question, \
+    QuestionCollection
 from hifis_surveyval.hifis_surveyval import HIFISSurveyval
 
 
 def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
-    """Execute hifis software services script."""
-
-    def run_analysis(data_set: pd.DataFrame, id_list: List,
-                     tbl: pd.DataFrame, compact_format) -> pd.DataFrame:
-        """Compute descriptive statistics for different scale levels and
-        response formats and compile a table containing these statistics."""
-        # for each specified item
-        for q_id in id_list:
-            # if data are categorical and answers were given as single choice
-            if (q_id == "Q005" or q_id == "Q006" or q_id == "Q007-T"
-                    or q_id == "Q035"):
-                # compute relative frequencies for question
-                [n, stats_tbl, unit] = stats.nominal_single_choice(
-                    variable=q_id[:4] + "/_",
-                    meta=data,
-                    data=data_set[(q_id, "_")]
-                )
-            # if data are categorical and answers were given as multiple choice
-            elif q_id == "Q037":
-                # compute relative frequencies for question collection
-                [n, stats_tbl, unit] = stats.nominal_multiple_choice(
-                    variable=q_id,
-                    meta=data,
-                    data=data_set[q_id]
-                )
-            # if data are categorical and answers were given as single choice
-            # for multiple questions (array)
-            elif q_id == "Q036":
-                # Compute relative frequencies for question collection
-                [n, stats_tbl, unit] = stats.nominal_array(
-                    variable=q_id,
-                    meta=data,
-                    data=data_set[q_id]
-                )
-            # if data are numerical and answers were given as single choice
-            elif q_id == "Q007":
-                # Compute basic descriptive statistics for question
-                [n, stats_tbl, unit] = stats.scale_single_choice(
-                    variable=q_id[:4] + "/_",
-                    meta=data,
-                    data=data_set[(q_id, "_")],
-                    custom_stats=["mean", "std"]
-                )
-            else:
-                print("You did not specify any analysis for this item: "
-                      + q_id)
-                continue
-
-            # add individual item statistics to statistics table
-            # (vertical order)
-            tbl = stats.add_to_table(
-                tbl=tbl,
-                variable=q_id[:4],
-                meta=data,
-                count=n,
-                stats=stats_tbl,
-                unit=unit,
-                compact=compact_format
-            )
-
-        return tbl
-
-    # output path
-    path = str(hifis_surveyval.settings.ANALYSIS_OUTPUT_PATH) + "\\"
-
-    # items relevant to describe aspects of software development at HGF
-    # and needs regarding software development support offerings
-    ids: List = ["Q006", "Q005", "Q007", "Q035", "Q036", "Q037"]
-
-    # define Helmholtz centre as grouping variable
-    grouping_variable: str = "Q001/_"
-
-    # load data for the specified questions
-    software_services_data: pd.DataFrame = data.data_frame_for_ids(
-        ids + [grouping_variable]
-    )
+    """
+    Runs descriptive analysis for different questions of the HIFIS services
+    section of the HIFIS survey.
+    """
+    # Relevant questions and corresponding analysis
+    questions: Dict = {
+        "Q035/_": _hifis_services_used,
+        "Q036": _language_preference,
+        "Q037": _hifis_services_announcement
+    }
+
+    # Create statistics table containing total number of cases
+    hifis_services_sample: DataFrame = data.data_frame_for_ids(
+            requested_ids=list(questions.keys())
+        ).dropna(how="all")
+    hifis_services_tbl: DataFrame = \
+        stats.get_sample_size(hifis_services_sample)
+
+    # Exclude cases that did not answer any HIFIS services question
+    incomplete_cases: Set = set(data.data_frame_for_ids(
+        requested_ids=data.question_collection_ids
+    ).index).difference(set(hifis_services_sample.index))
+
+    # Run the respective analysis for each of the specified questions
+    # and save descriptive statistics in joint table
+    for question_id, analysis in questions.items():
+        hifis_services_tbl = analysis(
+            hifis_surveyval=hifis_surveyval,
+            data=data,
+            question_id=question_id,
+            stats_tbl=hifis_services_tbl,
+            cases=incomplete_cases
+        )
 
-    # transform interval variable Q007 into categorical variable using
-    # quantiles for binning
-    software_services_data.loc[:, "Q007-T/_"] = pd.cut(
-        x=software_services_data["Q007/_"],
-        bins=[0, 25, 50, 75, 100],
-        right=True,
-        labels=["0-25%", "26-50%", "51-75%", "76-100%"],
-        include_lowest=True
+    # Display and export resulting table of sample characteristics
+    print("\nHIFIS survey community:\n{}\n"
+          .format(hifis_services_tbl))
+
+
+def _hifis_services_used(hifis_surveyval: HIFISSurveyval, data: DataContainer,
+                         question_id: str, stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on use of HIFIS services
+    question: Question = data.question_for_id(question_id)
+    question.remove_answers(cases)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_single_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+         tbl=stats_tbl,
+         question=question,
+         count=n,
+         stats=question_freq_rel,
+         unit=unit,
+         compact=False
     )
 
-    # add transformed variable to list of questions
-    ids = ids + ["Q007-T"]
-    ids.sort()
+    return stats_tbl
 
-    #TODO: Fix order and replace Q006/A006 with missing value
 
-    # set multi-index
-    software_services_data.columns = \
-        software_services_data.columns.str.split('/', expand=True)
+def _language_preference(hifis_surveyval: HIFISSurveyval, data: DataContainer,
+                         question_id: str, stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on preferred language for different services
+    question: QuestionCollection = data.collection_for_id(question_id)
+    question.remove_answers(cases)
 
-    # create statistics table containing total number of cases
-    software_services_tbl: pd.DataFrame = \
-        stats.get_sample_size(software_services_data)
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_array(question)
 
-    # compute descriptive statistics
-    software_services_tbl = run_analysis(
-        data_set=software_services_data,
-        id_list=ids,
-        tbl=software_services_tbl,
-        compact_format=False
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+        tbl=stats_tbl,
+        question=question,
+        count=n,
+        stats=question_freq_rel,
+        unit=unit,
+        compact=False
     )
 
-    # list all Helmholtz centres (not including Others)
-    groups: List = [
-        answer_option.label for answer_option in
-        data.question_for_id(grouping_variable)._answer_options.values()
-    ][:-1]
-
-    # create new table for grouped sample characteristics
-    grouped_software_services_tbl: pd.DataFrame = pd.DataFrame()
-
-    # for each Helmholtz centre
-    for group in groups:
-        # get the centre-specific data set
-        group_data: pd.DataFrame = software_services_data[
-            software_services_data[grouping_variable[:4]].values == group
-            ]
-        # compute descriptive statistics for each centre separately
-        centre_software_services_tbl: pd.DataFrame = run_analysis(
-            data_set=group_data,
-            id_list=ids,
-            tbl=pd.DataFrame(columns=[""]),
-            compact_format=True
-        )
-        # set centre as header
-        centre_software_services_tbl.columns = [group]
-        # add centre-specific statistics to grouped sample table
-        # accounting for identical indices by resetting index of individual tbl
-        index: pd.Index = centre_software_services_tbl.index
-        grouped_software_services_tbl = grouped_software_services_tbl.join(
-            other=centre_software_services_tbl.reset_index(drop=True),
-            how="right",
-            sort=False
-        )
-    grouped_software_services_tbl.index = index
-
-    # display results
-    print("\n<< -- HIFIS software services -- >>")
-    print("\nTotal sample: \n", software_services_tbl)
-    print("\nGrouped per Helmholtz centre: \n", grouped_software_services_tbl)
-
-    # export results
-    software_services_tbl.to_latex(
-        buf=path + "software-services.tex",
-        header=True,
-        index=True,
-        na_rep="",
-        float_format="{:0.2f}".format,
-        column_format="@{}p{0.9\\textwidth}r@{}r@{}",
-        caption="Software services: status quo",
-        label="tab:software_services_statistics",
-        position="htbp",
-        longtable=True
+    # Rearrange statistics data frame for plotting
+    stats_to_plot = DataFrame()
+    stats_to_plot = stats_to_plot.append(question_freq_rel[1:4].T)
+    stats_to_plot = stats_to_plot.append(question_freq_rel[5:8].T)
+    stats_to_plot = stats_to_plot.append(question_freq_rel[9:12].T)
+    stats_to_plot = stats_to_plot.append(question_freq_rel[13:16].T)
+    stats_to_plot = stats_to_plot.append(question_freq_rel[17:].T)
+    stats_to_plot.index = question_freq_rel.index.values[[0, 4, 8, 12, 16]]
+
+    # Plot preferred language for each HIFIS software service
+    stats.bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        stats_to_plot=stats_to_plot.astype(float, errors="raise")
     )
 
-    grouped_software_services_tbl.to_latex(
-        buf=path + "software-services-grouped.tex",
-        header=True,
-        index=True,
-        na_rep="",
-        float_format="{:0.0f}".format,
-        column_format="@{}p{0.25\\textwidth}rrrrrrrrrrrrrrrrrrr@{}",
-        caption="Software services per Helmholtz centre",
-        label="tab:software_services_statistics_grouped",
-        position="htbp",
-        longtable=True
+    return stats_tbl
+
+
+def _hifis_services_announcement(hifis_surveyval: HIFISSurveyval,
+                                 data: DataContainer, question_id: str,
+                                 stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on preferred communication channels for HIFIS services updates
+    question: QuestionCollection = data.collection_for_id(question_id)
+    question.remove_answers(cases)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+        tbl=stats_tbl,
+        question=question,
+        count=n,
+        stats=question_freq_rel,
+        unit=unit,
+        compact=False
     )
 
-    # generate grouped box plot for interval variable(s)
-    stats.grouped_box_plot(
-        variable="Q007",
-        g_variable=grouping_variable[:4],
-        meta=data,
-        data=software_services_data[[
-            grouping_variable[:4], "Q007"
-        ]].droplevel(axis=1, level=1),
-        unit="\\%",
-        path=path
+    # Plot preferred communication channels
+    stats.bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        stats_to_plot=question_freq_rel.astype(float, errors="raise")
     )
 
-    # TODO: Add plots
\ No newline at end of file
+    return stats_tbl
diff --git a/scripts/modules/descriptive_stats.py b/scripts/modules/descriptive_stats.py
index a1a6fe30..d13a3f5d 100644
--- a/scripts/modules/descriptive_stats.py
+++ b/scripts/modules/descriptive_stats.py
@@ -303,6 +303,14 @@ def bar_plot(hifis_surveyval: HIFISSurveyval, stats_to_plot: DataFrame,
         # Do not show legend with answer labels
         show_legend: bool = kwargs.get("legend", False)
 
+    # For array questions
+    if stats_to_plot.shape[1] > 1 :
+        # Use bar for each sub-question
+        stacked = True
+
+        # Show legend with answer labels
+        show_legend = True
+
     print(
         "\nBar plot for {}:\n"
         "Number of HIFIS survey respondents (n={}):\n{}".format(
-- 
GitLab


From 374a96d9b925f094980690bc40b2e88ae8425139 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Mon, 29 Nov 2021 16:52:47 +0100
Subject: [PATCH 04/13] Fix some errors in plots

---
 scripts/hifis-software-services.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/hifis-software-services.py b/scripts/hifis-software-services.py
index 6f7ff66e..b7430668 100644
--- a/scripts/hifis-software-services.py
+++ b/scripts/hifis-software-services.py
@@ -103,7 +103,8 @@ def _language_preference(hifis_surveyval: HIFISSurveyval, data: DataContainer,
     stats.bar_plot(
         hifis_surveyval=hifis_surveyval,
         question=question,
-        stats_to_plot=stats_to_plot.astype(float, errors="raise")
+        stats_to_plot=stats_to_plot.astype(float, errors="raise"),
+        array_question=True
     )
 
     return stats_tbl
-- 
GitLab


From 65cc1816c7b2754f0e1ad6c1a7565368129715d5 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 1 Dec 2021 12:13:25 +0100
Subject: [PATCH 05/13] Consolidate changes

---
 scripts/modules/descriptive_stats.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/modules/descriptive_stats.py b/scripts/modules/descriptive_stats.py
index d13a3f5d..3ad12a76 100644
--- a/scripts/modules/descriptive_stats.py
+++ b/scripts/modules/descriptive_stats.py
@@ -105,7 +105,7 @@ def nominal_multiple_choice(question: QuestionCollection, **kwargs) \
     return n_valid_responses, stats_df, "%"
 
 
-def nominal_array(question: QuestionCollection) -> [int, DataFrame, str]:
+def nominal_array(question: QuestionCollection, ) -> [int, DataFrame, str]:
     """Return a table containing relative frequencies for each sub-question of
     a categorical variable if variable is an array question"""
     # Get data for question excluding non-respondents
@@ -304,12 +304,18 @@ def bar_plot(hifis_surveyval: HIFISSurveyval, stats_to_plot: DataFrame,
         show_legend: bool = kwargs.get("legend", False)
 
     # For array questions
-    if stats_to_plot.shape[1] > 1 :
+    if array_question:
         # Use bar for each sub-question
         stacked = True
 
         # Show legend with answer labels
         show_legend = True
+    else:
+        # Use bar for each answer option
+        stacked = False
+
+        # Do not show legend with answer labels
+        show_legend = False
 
     print(
         "\nBar plot for {}:\n"
-- 
GitLab


From e015969e38fc5e6d0f37ce9c0cd3ad6d7b062515 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 1 Dec 2021 12:22:28 +0100
Subject: [PATCH 06/13] Consolidate changes

---
 scripts/modules/descriptive_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/modules/descriptive_stats.py b/scripts/modules/descriptive_stats.py
index 3ad12a76..da01d3f8 100644
--- a/scripts/modules/descriptive_stats.py
+++ b/scripts/modules/descriptive_stats.py
@@ -105,7 +105,7 @@ def nominal_multiple_choice(question: QuestionCollection, **kwargs) \
     return n_valid_responses, stats_df, "%"
 
 
-def nominal_array(question: QuestionCollection, ) -> [int, DataFrame, str]:
+def nominal_array(question: QuestionCollection) -> [int, DataFrame, str]:
     """Return a table containing relative frequencies for each sub-question of
     a categorical variable if variable is an array question"""
     # Get data for question excluding non-respondents
-- 
GitLab


From b02a4fc169b2933872b10dbcd05bdd1a6d38eab0 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 1 Dec 2021 15:39:11 +0100
Subject: [PATCH 07/13] Consolidate changes

---
 scripts/example_plot.py              |   3 -
 scripts/hifis-software-community.py  | 388 +++++++++++++++++++++++++++
 scripts/modules/descriptive_stats.py |  14 -
 3 files changed, 388 insertions(+), 17 deletions(-)
 create mode 100644 scripts/hifis-software-community.py

diff --git a/scripts/example_plot.py b/scripts/example_plot.py
index dba21cc2..8433aa61 100644
--- a/scripts/example_plot.py
+++ b/scripts/example_plot.py
@@ -4,11 +4,8 @@ import modules.descriptive_stats as stats
 from hifis_surveyval.core import util
 from hifis_surveyval.data_container import DataContainer, Question
 from hifis_surveyval.hifis_surveyval import HIFISSurveyval
-from matplotlib import rc
 from pandas import DataFrame
 
-rc('text', usetex=True)
-
 
 def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
     """Generate example plots"""
diff --git a/scripts/hifis-software-community.py b/scripts/hifis-software-community.py
new file mode 100644
index 00000000..01d37af4
--- /dev/null
+++ b/scripts/hifis-software-community.py
@@ -0,0 +1,388 @@
+from typing import Dict, List, Set
+
+import modules.descriptive_stats as stats
+from pandas import DataFrame, Series
+from hifis_surveyval.data_container import DataContainer, Question, \
+    QuestionCollection
+from hifis_surveyval.hifis_surveyval import HIFISSurveyval
+
+
+def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
+    """
+    Compute descriptive statistics for WP Community specific questions to
+    analyze different aspects of respondent's interest in community events
+    """
+    # Output path
+    path: str = str(hifis_surveyval.settings.ANALYSIS_OUTPUT_PATH) + "\\"
+
+    # Relevant questions
+    questions: Dict = {
+        "Q008/_": _rse_identity,
+        "Q030": _community_events_interest,
+        "Q031": _community_events_satisfaction,
+        "Q032": _community_events_size,
+        "Q033": _community_events_participation
+    }
+
+    # Create statistics table containing total number of cases
+    community_sample: DataFrame = data.data_frame_for_ids(
+            requested_ids=list(questions.keys())
+        ).dropna(how="all")
+    community_tbl: DataFrame = stats.get_sample_size(community_sample)
+
+    # Exclude cases from analysis who did not answer any community question
+    incomplete_cases: Set = set(data.data_frame_for_ids(
+        requested_ids=data.question_collection_ids
+    ).index).difference(set(community_sample.index))
+
+    # Run the respective analysis for each of the specified questions
+    # and save descriptive statistics in joint table
+    for question_id, analysis in questions.items():
+        community_tbl = analysis(
+            hifis_surveyval=hifis_surveyval,
+            data=data,
+            question_id=question_id,
+            stats_tbl=community_tbl,
+            cases=incomplete_cases
+        )
+    community_tbl.rename({"mean": "M", "std": "SD"}, inplace=True)
+
+    # Display and export resulting table of sample characteristics
+    print("\nHIFIS survey community:\n{}\n"
+          .format(community_tbl))
+    community_tbl.to_latex(
+        buf=path + "community-statistics.tex",
+        header=True,
+        index=True,
+        na_rep="",
+        float_format="{:0.2f}".format,
+        column_format="@{}p{0.9\\textwidth}r@{}r@{}",
+        caption="Community",
+        label="tab:community_stats",
+        position="htbp",
+        longtable=False
+    )
+
+
+def _rse_identity(hifis_surveyval: HIFISSurveyval, data: DataContainer,
+                  question_id: str, stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on identification as research software engineer
+    question: Question = data.question_for_id(question_id)
+    question.remove_answers(cases)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_single_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+         tbl=stats_tbl,
+         question=question,
+         count=n,
+         stats=question_freq_rel,
+         unit=unit,
+         compact=False
+    )
+
+    return stats_tbl
+
+
+def _community_events_interest(hifis_surveyval: HIFISSurveyval,
+                                   data: DataContainer, question_id: str,
+                                   stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on general interest in community events
+    question: QuestionCollection = data.collection_for_id(question_id)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+         tbl=stats_tbl,
+         question=question,
+         count=n,
+         stats=question_freq_rel,
+         unit=unit,
+         compact=False
+    )
+
+    # Grouping variable
+    grouping: Question = data.question_for_id("Q008/_")
+    grouping.remove_answers(cases)
+    grouping_variable: Series = grouping.as_series()\
+        .replace({"Not sure": None})
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    question_variable: DataFrame = question.as_data_frame()
+    question_variable.drop(
+        columns=[column for column in question_variable.columns
+                 if column.endswith("comment")],
+        inplace=True
+    )
+    grouped_data: DataFrame = question_variable\
+        .groupby(grouping_variable)\
+        .sum()
+    grouped_data = grouped_data.divide(
+        grouped_data.sum(axis="columns"), axis="index"
+    ) * 100
+
+    # Replace sub-question ids with sub-question labels as column header
+    sub_questions: Dict = {
+        sub_question.full_id: sub_question.label for sub_question in
+        question._questions.values()
+    }
+    grouped_data.rename(columns=sub_questions,
+                        index={"No": "Non-RSE", "Yes": "RSE"},
+                        inplace=True)
+
+    # Plot general interest in community events grouped by RSE identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        grouping=grouping,
+        stats_to_plot=grouped_data.T
+    )
+
+    # Plot reasons for not participating in any community events
+    sub_question: Question = data.question_for_id("Q030/SQ001comment")
+    [_, sq001_freq_rel, _] = stats.nominal_single_choice(sub_question)
+    stats.bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        stats_to_plot=DataFrame(sq001_freq_rel)
+    )
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    sub_question_variable: Series = sub_question.as_series()
+    response_options: List[str] = [
+        answer_option.label for answer_option in
+        sub_question._answer_options.values()
+    ]
+    grouped_data: DataFrame = DataFrame(
+        sub_question_variable.groupby(grouping_variable).value_counts()
+    )
+    grouped_data = grouped_data.loc["No", :]\
+        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
+              rsuffix="_yes")\
+        .rename(columns={"Q030/SQ001comment_yes": "RSE",
+                         "Q030/SQ001comment_no": "Non-RSE"}) \
+        .fillna(value=0)
+    grouped_data = (grouped_data.divide(
+        grouped_data.sum(axis="index"), axis="columns"
+    ) * 100).reindex(response_options, fill_value=0)
+
+    # Plot reasons for not participating in any community events grouped by
+    # RSE identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        grouping=grouping,
+        stats_to_plot=grouped_data,
+        y_tick_labels=grouped_data.index
+    )
+
+    # Plot community events respondents have heard of
+    sub_question: Question = data.question_for_id("Q030/SQ003comment")
+    [_, sq003_freq_rel, _] = stats.nominal_single_choice(sub_question)
+    stats.bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        stats_to_plot=DataFrame(sq003_freq_rel)
+    )
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    sub_question_variable: Series = sub_question.as_series()
+    response_options: List[str] = [
+        answer_option.label for answer_option in
+        sub_question._answer_options.values()
+    ]
+    grouped_data: DataFrame = DataFrame(
+        sub_question_variable.groupby(grouping_variable).value_counts()
+    )
+    grouped_data = grouped_data.loc["No", :] \
+        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
+              rsuffix="_yes") \
+        .rename(columns={"Q030/SQ003comment_yes": "RSE",
+                         "Q030/SQ003comment_no": "Non-RSE"})\
+        .fillna(value=0)
+    grouped_data = (grouped_data.divide(
+        grouped_data.sum(axis="index"), axis="columns"
+    ) * 100).reindex(response_options, fill_value=0)
+
+    # Plot events respondents have heard of grouped by RSE identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        grouping=grouping,
+        stats_to_plot=grouped_data,
+        y_tick_labels=grouped_data.index
+    )
+
+    # Plot community events respondents participated in
+    sub_question: Question = data.question_for_id("Q030/SQ004comment")
+    [_, sq004_freq_rel, _] = stats.nominal_single_choice(sub_question)
+    stats.bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        stats_to_plot=DataFrame(sq004_freq_rel)
+    )
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    sub_question_variable: Series = sub_question.as_series()
+    response_options: List[str] = [
+        answer_option.label for answer_option in
+        sub_question._answer_options.values()
+    ]
+    grouped_data: DataFrame = DataFrame(
+        sub_question_variable.groupby(grouping_variable).value_counts()
+    )
+    grouped_data = grouped_data.loc["No", :] \
+        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
+              rsuffix="_yes") \
+        .rename(columns={"Q030/SQ004comment_yes": "RSE",
+                         "Q030/SQ004comment_no": "Non-RSE"})\
+        .fillna(value=0)
+    grouped_data = (grouped_data.divide(
+        grouped_data.sum(axis="index"), axis="columns"
+    ) * 100).reindex(response_options, fill_value=0)
+
+    # Plot community events respondents participated in grouped by RSE identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=sub_question,
+        grouping=grouping,
+        stats_to_plot=grouped_data,
+        y_tick_labels=grouped_data.index
+    )
+
+    return stats_tbl
+
+
+def _community_events_satisfaction(hifis_surveyval: HIFISSurveyval,
+                                   data: DataContainer, question_id: str,
+                                   stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on community event satisfaction
+    question: QuestionCollection = data.collection_for_id(question_id)
+
+    # Relative frequencies of observations
+    [n, summary_stats, unit] = stats.scale_array(question=question,
+                                                 orientation_wide=False,
+                                                 custom_stats=["mean", "std"])
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+         tbl=stats_tbl,
+         question=question,
+         count=n,
+         stats=summary_stats,
+         unit=unit,
+         compact=False
+    )
+
+    return stats_tbl
+
+
+def _community_events_size(hifis_surveyval: HIFISSurveyval,
+                           data: DataContainer, question_id: str,
+                           stats_tbl: DataFrame, cases: Set) -> DataFrame:
+    # Get data on preferred community event size
+    question: QuestionCollection = data.collection_for_id(question_id)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+         tbl=stats_tbl,
+         question=question,
+         count=n,
+         stats=question_freq_rel,
+         unit=unit,
+         compact=False
+    )
+
+    # Grouping variable
+    grouping: Question = data.question_for_id("Q008/_")
+    grouping.remove_answers(cases)
+    grouping_variable: Series = grouping.as_series()\
+        .replace({"Not sure": None})
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    grouped_data: DataFrame = question.as_data_frame()\
+        .groupby(grouping_variable)\
+        .sum()\
+        .divide(grouping_variable.value_counts(), axis="index") * 100
+
+    # Replace sub-question ids with sub-question labels as column header
+    sub_questions: Dict = {
+        sub_question.full_id: sub_question.label for sub_question in
+        question._questions.values()
+    }
+    grouped_data.rename(columns=sub_questions,
+                        index={"No": "Non-RSE", "Yes": "RSE"},
+                        inplace=True)
+
+    # Plot preferred community event size grouped by RSE identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        grouping=grouping,
+        stats_to_plot=grouped_data.T
+    )
+
+    return stats_tbl
+
+
+def _community_events_participation(hifis_surveyval: HIFISSurveyval,
+                                    data: DataContainer, question_id: str,
+                                    stats_tbl: DataFrame, cases: Set) \
+        -> DataFrame:
+    # Get data on preferred forms of community event participation
+    question: QuestionCollection = data.collection_for_id(question_id)
+
+    # Relative frequencies of observations
+    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
+
+    # Add question statistics to table
+    stats_tbl = stats.add_to_table(
+                     tbl=stats_tbl,
+                     question=question,
+                     count=n,
+                     stats=question_freq_rel,
+                     unit=unit,
+                     compact=False
+    )
+
+    # Grouping variable
+    grouping: Question = data.question_for_id("Q008/_")
+    grouping.remove_answers(cases)
+    grouping_variable: Series = grouping.as_series()\
+        .replace({"Not sure": None})
+
+    # Group data by RSEs vs non-RSEs and compute relative frequencies
+    grouped_data: DataFrame = question.as_data_frame()\
+        .groupby(grouping_variable)\
+        .sum()\
+        .divide(grouping_variable.value_counts(), axis="index") * 100
+
+    # Replace sub-question ids with sub-question labels as column header
+    sub_questions: Dict = {
+        sub_question.full_id: sub_question.label for sub_question in
+        question._questions.values()
+    }
+    grouped_data.rename(columns=sub_questions,
+                        index={"No": "Non-RSE", "Yes": "RSE"},
+                        inplace=True)
+
+    # Plot preferred forms of community event participation grouped by RSE
+    # identity
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        grouping=grouping,
+        stats_to_plot=grouped_data.T
+    )
+
+    return stats_tbl
diff --git a/scripts/modules/descriptive_stats.py b/scripts/modules/descriptive_stats.py
index da01d3f8..a1a6fe30 100644
--- a/scripts/modules/descriptive_stats.py
+++ b/scripts/modules/descriptive_stats.py
@@ -303,20 +303,6 @@ def bar_plot(hifis_surveyval: HIFISSurveyval, stats_to_plot: DataFrame,
         # Do not show legend with answer labels
         show_legend: bool = kwargs.get("legend", False)
 
-    # For array questions
-    if array_question:
-        # Use bar for each sub-question
-        stacked = True
-
-        # Show legend with answer labels
-        show_legend = True
-    else:
-        # Use bar for each answer option
-        stacked = False
-
-        # Do not show legend with answer labels
-        show_legend = False
-
     print(
         "\nBar plot for {}:\n"
         "Number of HIFIS survey respondents (n={}):\n{}".format(
-- 
GitLab


From 84b42b0e4640e5e45495a01f49fcbe1e39a391a7 Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 1 Dec 2021 16:00:52 +0100
Subject: [PATCH 08/13] Rename function to match issue description

---
 scripts/hifis-software-services.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/hifis-software-services.py b/scripts/hifis-software-services.py
index b7430668..da014c9a 100644
--- a/scripts/hifis-software-services.py
+++ b/scripts/hifis-software-services.py
@@ -9,14 +9,14 @@ from hifis_surveyval.hifis_surveyval import HIFISSurveyval
 
 def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
     """
-    Runs descriptive analysis for different questions of the HIFIS services
-    section of the HIFIS survey.
+    Compute descriptive statistics for general questions regarding HIFIS
+    services and communication.
     """
     # Relevant questions and corresponding analysis
     questions: Dict = {
         "Q035/_": _hifis_services_used,
         "Q036": _language_preference,
-        "Q037": _hifis_services_announcement
+        "Q037": _communication_channels
     }
 
     # Create statistics table containing total number of cases
@@ -102,17 +102,17 @@ def _language_preference(hifis_surveyval: HIFISSurveyval, data: DataContainer,
     # Plot preferred language for each HIFIS software service
     stats.bar_plot(
         hifis_surveyval=hifis_surveyval,
-        question=question,
         stats_to_plot=stats_to_plot.astype(float, errors="raise"),
+        question=question,
         array_question=True
     )
 
     return stats_tbl
 
 
-def _hifis_services_announcement(hifis_surveyval: HIFISSurveyval,
-                                 data: DataContainer, question_id: str,
-                                 stats_tbl: DataFrame, cases: Set) \
+def _communication_channels(hifis_surveyval: HIFISSurveyval,
+                            data: DataContainer, question_id: str,
+                            stats_tbl: DataFrame, cases: Set) \
         -> DataFrame:
     # Get data on preferred communication channels for HIFIS services updates
     question: QuestionCollection = data.collection_for_id(question_id)
@@ -134,8 +134,8 @@ def _hifis_services_announcement(hifis_surveyval: HIFISSurveyval,
     # Plot preferred communication channels
     stats.bar_plot(
         hifis_surveyval=hifis_surveyval,
-        question=question,
-        stats_to_plot=question_freq_rel.astype(float, errors="raise")
+        stats_to_plot=question_freq_rel.astype(float, errors="raise"),
+        question=question
     )
 
     return stats_tbl
-- 
GitLab


From 4dba1d5e42577099fabbaabe93ddf95e9eedb01f Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 1 Dec 2021 16:02:08 +0100
Subject: [PATCH 09/13] Rename script

---
 scripts/{hifis-software-services.py => hifis-services.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{hifis-software-services.py => hifis-services.py} (100%)

diff --git a/scripts/hifis-software-services.py b/scripts/hifis-services.py
similarity index 100%
rename from scripts/hifis-software-services.py
rename to scripts/hifis-services.py
-- 
GitLab


From 230b747597b0dfff96986fb7d02c5178860bb90b Mon Sep 17 00:00:00 2001
From: Katharina Dworatzyk <katharina.dworatzyk@dlr.de>
Date: Wed, 8 Dec 2021 13:04:58 +0100
Subject: [PATCH 10/13] Clean up

---
 report/sec/example-plots.tex | 23 +++++++++++++++++++++++
 report/tbl/sample-size.tex   |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/report/sec/example-plots.tex b/report/sec/example-plots.tex
index e57afe5c..bd490c90 100644
--- a/report/sec/example-plots.tex
+++ b/report/sec/example-plots.tex
@@ -11,3 +11,26 @@ Everything in round brackets is not yet implemented in the framework (WIP)
 	\includegraphics[width=\textwidth]{fig/example-grouped-box-plot.pdf}
 	\caption{These are box plots showing the characteristics of Question among different groups of HIFIS survey respondents. Overall, we observed a great variation within and between groups. The group comparison shows that Group 6 (n=XY) does most Question (median=X), while Group 2 (n=XY) does lowest. Statistical analysis did not show significant differences between groups (statistic=Z, p=0.89), suggesting a cross-group importance of Question.}
 \end{figure}
+
+Figures \ref{fig:wordcloud-1} -- \ref{fig:wordcloud-3} show examples of how to include word cloud graphics.
+
+\begin{figure}
+	\centering
+	\includegraphics[width=\textwidth]{fig/Q034-word-cloud.pdf}
+	\caption{Word cloud 1.}
+	\label{fig:wordcloud-1}
+\end{figure}
+
+\begin{figure}
+	\centering
+	\includegraphics[width=\textwidth]{fig/Q030_SQ002comment-word-cloud.pdf}
+	\caption{Word cloud 2.}
+	\label{fig:wordcloud-2}
+\end{figure}
+
+\begin{figure}
+	\centering
+	\includegraphics[width=\textwidth]{fig/Cloud-word-cloud.pdf}
+	\caption{Word cloud 3.}
+	\label{fig:wordcloud-3}
+\end{figure}
diff --git a/report/tbl/sample-size.tex b/report/tbl/sample-size.tex
index 830db61c..1b54ebd9 100644
--- a/report/tbl/sample-size.tex
+++ b/report/tbl/sample-size.tex
@@ -27,4 +27,4 @@
 		\bottomrule
 	\end{tabular*}
 	\label{tab:sample_size} 
-\end{table}
\ No newline at end of file
+\end{table}
-- 
GitLab


From 28882bab79245451e58d61fabb4522951bf21a75 Mon Sep 17 00:00:00 2001
From: "Dworatzyk, Katharina" <katharina.dworatzyk@dlr.de>
Date: Sat, 11 Dec 2021 22:09:56 +0100
Subject: [PATCH 11/13] Add grouped analysis

---
 scripts/hifis-services.py           |  54 ++++
 scripts/hifis-software-community.py | 388 ----------------------------
 2 files changed, 54 insertions(+), 388 deletions(-)
 delete mode 100644 scripts/hifis-software-community.py

diff --git a/scripts/hifis-services.py b/scripts/hifis-services.py
index da014c9a..b829d5c6 100644
--- a/scripts/hifis-services.py
+++ b/scripts/hifis-services.py
@@ -1,7 +1,9 @@
 from typing import Dict, Set
 
 import modules.descriptive_stats as stats
+from hifis_surveyval.core import util
 from pandas import DataFrame
+from numpy import nan
 from hifis_surveyval.data_container import DataContainer, Question, \
     QuestionCollection
 from hifis_surveyval.hifis_surveyval import HIFISSurveyval
@@ -67,6 +69,29 @@ def _hifis_services_used(hifis_surveyval: HIFISSurveyval, data: DataContainer,
          compact=False
     )
 
+    # Grouping variable
+    grouping_id: str = "V002/_"
+    grouping: Question = data.question_for_id(grouping_id)
+
+    # Group data by HIFIS vs. Non-HIFIS
+    grouped_data: DataFrame = util.filter_and_group_series(
+        question.as_series(), grouping.as_series().dropna()
+    )
+
+    # Relative frequencies of observations
+    question_freq_rel["HIFIS"] = grouped_data["Yes"]\
+                                     .value_counts(normalize=True) * 100
+    question_freq_rel["Non-HIFIS"] = grouped_data["No"]\
+                                         .value_counts(normalize=True) * 100
+
+    # Plot use of HIFIS services for HIFIS vs. Non-HIFIS
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        stats_to_plot=question_freq_rel[["HIFIS", "Non-HIFIS"]],
+        question=question,
+        grouping=grouping
+    )
+
     return stats_tbl
 
 
@@ -138,4 +163,33 @@ def _communication_channels(hifis_surveyval: HIFISSurveyval,
         question=question
     )
 
+    # Grouping variable
+    grouping_id: str = "V002/_"
+    grouping: Question = data.question_for_id(grouping_id)
+
+    # Group data by HIFIS vs. Non-HIFIS
+    grouped_data: DataFrame = question.as_data_frame()\
+        .groupby(grouping.as_series())\
+        .sum()\
+        .divide(grouping.as_series().value_counts(), axis="index") * 100
+
+
+    # Replace sub-question ids with sub-question labels as column header
+    sub_questions: Dict = {
+        sub_question.full_id: sub_question.label for sub_question in
+        question._questions.values()
+    }
+    grouped_data.rename(columns=sub_questions,
+                        index={"No": "Non-HIFIS", "Yes": "HIFIS"},
+                        inplace=True)
+
+    # Plot preferred communication channels grouped by HIFIS vs. Non-HIFIS
+    stats.grouped_bar_plot(
+        hifis_surveyval=hifis_surveyval,
+        question=question,
+        grouping=grouping,
+        stats_to_plot=grouped_data.T
+    )
+
+
     return stats_tbl
diff --git a/scripts/hifis-software-community.py b/scripts/hifis-software-community.py
deleted file mode 100644
index 01d37af4..00000000
--- a/scripts/hifis-software-community.py
+++ /dev/null
@@ -1,388 +0,0 @@
-from typing import Dict, List, Set
-
-import modules.descriptive_stats as stats
-from pandas import DataFrame, Series
-from hifis_surveyval.data_container import DataContainer, Question, \
-    QuestionCollection
-from hifis_surveyval.hifis_surveyval import HIFISSurveyval
-
-
-def run(hifis_surveyval: HIFISSurveyval, data: DataContainer):
-    """
-    Compute descriptive statistics for WP Community specific questions to
-    analyze different aspects of respondent's interest in community events
-    """
-    # Output path
-    path: str = str(hifis_surveyval.settings.ANALYSIS_OUTPUT_PATH) + "\\"
-
-    # Relevant questions
-    questions: Dict = {
-        "Q008/_": _rse_identity,
-        "Q030": _community_events_interest,
-        "Q031": _community_events_satisfaction,
-        "Q032": _community_events_size,
-        "Q033": _community_events_participation
-    }
-
-    # Create statistics table containing total number of cases
-    community_sample: DataFrame = data.data_frame_for_ids(
-            requested_ids=list(questions.keys())
-        ).dropna(how="all")
-    community_tbl: DataFrame = stats.get_sample_size(community_sample)
-
-    # Exclude cases from analysis who did not answer any community question
-    incomplete_cases: Set = set(data.data_frame_for_ids(
-        requested_ids=data.question_collection_ids
-    ).index).difference(set(community_sample.index))
-
-    # Run the respective analysis for each of the specified questions
-    # and save descriptive statistics in joint table
-    for question_id, analysis in questions.items():
-        community_tbl = analysis(
-            hifis_surveyval=hifis_surveyval,
-            data=data,
-            question_id=question_id,
-            stats_tbl=community_tbl,
-            cases=incomplete_cases
-        )
-    community_tbl.rename({"mean": "M", "std": "SD"}, inplace=True)
-
-    # Display and export resulting table of sample characteristics
-    print("\nHIFIS survey community:\n{}\n"
-          .format(community_tbl))
-    community_tbl.to_latex(
-        buf=path + "community-statistics.tex",
-        header=True,
-        index=True,
-        na_rep="",
-        float_format="{:0.2f}".format,
-        column_format="@{}p{0.9\\textwidth}r@{}r@{}",
-        caption="Community",
-        label="tab:community_stats",
-        position="htbp",
-        longtable=False
-    )
-
-
-def _rse_identity(hifis_surveyval: HIFISSurveyval, data: DataContainer,
-                  question_id: str, stats_tbl: DataFrame, cases: Set) \
-        -> DataFrame:
-    # Get data on identification as research software engineer
-    question: Question = data.question_for_id(question_id)
-    question.remove_answers(cases)
-
-    # Relative frequencies of observations
-    [n, question_freq_rel, unit] = stats.nominal_single_choice(question)
-
-    # Add question statistics to table
-    stats_tbl = stats.add_to_table(
-         tbl=stats_tbl,
-         question=question,
-         count=n,
-         stats=question_freq_rel,
-         unit=unit,
-         compact=False
-    )
-
-    return stats_tbl
-
-
-def _community_events_interest(hifis_surveyval: HIFISSurveyval,
-                                   data: DataContainer, question_id: str,
-                                   stats_tbl: DataFrame, cases: Set) \
-        -> DataFrame:
-    # Get data on general interest in community events
-    question: QuestionCollection = data.collection_for_id(question_id)
-
-    # Relative frequencies of observations
-    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
-
-    # Add question statistics to table
-    stats_tbl = stats.add_to_table(
-         tbl=stats_tbl,
-         question=question,
-         count=n,
-         stats=question_freq_rel,
-         unit=unit,
-         compact=False
-    )
-
-    # Grouping variable
-    grouping: Question = data.question_for_id("Q008/_")
-    grouping.remove_answers(cases)
-    grouping_variable: Series = grouping.as_series()\
-        .replace({"Not sure": None})
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    question_variable: DataFrame = question.as_data_frame()
-    question_variable.drop(
-        columns=[column for column in question_variable.columns
-                 if column.endswith("comment")],
-        inplace=True
-    )
-    grouped_data: DataFrame = question_variable\
-        .groupby(grouping_variable)\
-        .sum()
-    grouped_data = grouped_data.divide(
-        grouped_data.sum(axis="columns"), axis="index"
-    ) * 100
-
-    # Replace sub-question ids with sub-question labels as column header
-    sub_questions: Dict = {
-        sub_question.full_id: sub_question.label for sub_question in
-        question._questions.values()
-    }
-    grouped_data.rename(columns=sub_questions,
-                        index={"No": "Non-RSE", "Yes": "RSE"},
-                        inplace=True)
-
-    # Plot general interest in community events grouped by RSE identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=question,
-        grouping=grouping,
-        stats_to_plot=grouped_data.T
-    )
-
-    # Plot reasons for not participating in any community events
-    sub_question: Question = data.question_for_id("Q030/SQ001comment")
-    [_, sq001_freq_rel, _] = stats.nominal_single_choice(sub_question)
-    stats.bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        stats_to_plot=DataFrame(sq001_freq_rel)
-    )
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    sub_question_variable: Series = sub_question.as_series()
-    response_options: List[str] = [
-        answer_option.label for answer_option in
-        sub_question._answer_options.values()
-    ]
-    grouped_data: DataFrame = DataFrame(
-        sub_question_variable.groupby(grouping_variable).value_counts()
-    )
-    grouped_data = grouped_data.loc["No", :]\
-        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
-              rsuffix="_yes")\
-        .rename(columns={"Q030/SQ001comment_yes": "RSE",
-                         "Q030/SQ001comment_no": "Non-RSE"}) \
-        .fillna(value=0)
-    grouped_data = (grouped_data.divide(
-        grouped_data.sum(axis="index"), axis="columns"
-    ) * 100).reindex(response_options, fill_value=0)
-
-    # Plot reasons for not participating in any community events grouped by
-    # RSE identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        grouping=grouping,
-        stats_to_plot=grouped_data,
-        y_tick_labels=grouped_data.index
-    )
-
-    # Plot community events respondents have heard of
-    sub_question: Question = data.question_for_id("Q030/SQ003comment")
-    [_, sq003_freq_rel, _] = stats.nominal_single_choice(sub_question)
-    stats.bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        stats_to_plot=DataFrame(sq003_freq_rel)
-    )
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    sub_question_variable: Series = sub_question.as_series()
-    response_options: List[str] = [
-        answer_option.label for answer_option in
-        sub_question._answer_options.values()
-    ]
-    grouped_data: DataFrame = DataFrame(
-        sub_question_variable.groupby(grouping_variable).value_counts()
-    )
-    grouped_data = grouped_data.loc["No", :] \
-        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
-              rsuffix="_yes") \
-        .rename(columns={"Q030/SQ003comment_yes": "RSE",
-                         "Q030/SQ003comment_no": "Non-RSE"})\
-        .fillna(value=0)
-    grouped_data = (grouped_data.divide(
-        grouped_data.sum(axis="index"), axis="columns"
-    ) * 100).reindex(response_options, fill_value=0)
-
-    # Plot events respondents have heard of grouped by RSE identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        grouping=grouping,
-        stats_to_plot=grouped_data,
-        y_tick_labels=grouped_data.index
-    )
-
-    # Plot community events respondents participated in
-    sub_question: Question = data.question_for_id("Q030/SQ004comment")
-    [_, sq004_freq_rel, _] = stats.nominal_single_choice(sub_question)
-    stats.bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        stats_to_plot=DataFrame(sq004_freq_rel)
-    )
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    sub_question_variable: Series = sub_question.as_series()
-    response_options: List[str] = [
-        answer_option.label for answer_option in
-        sub_question._answer_options.values()
-    ]
-    grouped_data: DataFrame = DataFrame(
-        sub_question_variable.groupby(grouping_variable).value_counts()
-    )
-    grouped_data = grouped_data.loc["No", :] \
-        .join(grouped_data.loc["Yes", :], how="outer", lsuffix="_no",
-              rsuffix="_yes") \
-        .rename(columns={"Q030/SQ004comment_yes": "RSE",
-                         "Q030/SQ004comment_no": "Non-RSE"})\
-        .fillna(value=0)
-    grouped_data = (grouped_data.divide(
-        grouped_data.sum(axis="index"), axis="columns"
-    ) * 100).reindex(response_options, fill_value=0)
-
-    # Plot community events respondents participated in grouped by RSE identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=sub_question,
-        grouping=grouping,
-        stats_to_plot=grouped_data,
-        y_tick_labels=grouped_data.index
-    )
-
-    return stats_tbl
-
-
-def _community_events_satisfaction(hifis_surveyval: HIFISSurveyval,
-                                   data: DataContainer, question_id: str,
-                                   stats_tbl: DataFrame, cases: Set) \
-        -> DataFrame:
-    # Get data on community event satisfaction
-    question: QuestionCollection = data.collection_for_id(question_id)
-
-    # Relative frequencies of observations
-    [n, summary_stats, unit] = stats.scale_array(question=question,
-                                                 orientation_wide=False,
-                                                 custom_stats=["mean", "std"])
-
-    # Add question statistics to table
-    stats_tbl = stats.add_to_table(
-         tbl=stats_tbl,
-         question=question,
-         count=n,
-         stats=summary_stats,
-         unit=unit,
-         compact=False
-    )
-
-    return stats_tbl
-
-
-def _community_events_size(hifis_surveyval: HIFISSurveyval,
-                           data: DataContainer, question_id: str,
-                           stats_tbl: DataFrame, cases: Set) -> DataFrame:
-    # Get data on preferred community event size
-    question: QuestionCollection = data.collection_for_id(question_id)
-
-    # Relative frequencies of observations
-    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
-
-    # Add question statistics to table
-    stats_tbl = stats.add_to_table(
-         tbl=stats_tbl,
-         question=question,
-         count=n,
-         stats=question_freq_rel,
-         unit=unit,
-         compact=False
-    )
-
-    # Grouping variable
-    grouping: Question = data.question_for_id("Q008/_")
-    grouping.remove_answers(cases)
-    grouping_variable: Series = grouping.as_series()\
-        .replace({"Not sure": None})
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    grouped_data: DataFrame = question.as_data_frame()\
-        .groupby(grouping_variable)\
-        .sum()\
-        .divide(grouping_variable.value_counts(), axis="index") * 100
-
-    # Replace sub-question ids with sub-question labels as column header
-    sub_questions: Dict = {
-        sub_question.full_id: sub_question.label for sub_question in
-        question._questions.values()
-    }
-    grouped_data.rename(columns=sub_questions,
-                        index={"No": "Non-RSE", "Yes": "RSE"},
-                        inplace=True)
-
-    # Plot preferred community event size grouped by RSE identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=question,
-        grouping=grouping,
-        stats_to_plot=grouped_data.T
-    )
-
-    return stats_tbl
-
-
-def _community_events_participation(hifis_surveyval: HIFISSurveyval,
-                                    data: DataContainer, question_id: str,
-                                    stats_tbl: DataFrame, cases: Set) \
-        -> DataFrame:
-    # Get data on preferred forms of community event participation
-    question: QuestionCollection = data.collection_for_id(question_id)
-
-    # Relative frequencies of observations
-    [n, question_freq_rel, unit] = stats.nominal_multiple_choice(question)
-
-    # Add question statistics to table
-    stats_tbl = stats.add_to_table(
-                     tbl=stats_tbl,
-                     question=question,
-                     count=n,
-                     stats=question_freq_rel,
-                     unit=unit,
-                     compact=False
-    )
-
-    # Grouping variable
-    grouping: Question = data.question_for_id("Q008/_")
-    grouping.remove_answers(cases)
-    grouping_variable: Series = grouping.as_series()\
-        .replace({"Not sure": None})
-
-    # Group data by RSEs vs non-RSEs and compute relative frequencies
-    grouped_data: DataFrame = question.as_data_frame()\
-        .groupby(grouping_variable)\
-        .sum()\
-        .divide(grouping_variable.value_counts(), axis="index") * 100
-
-    # Replace sub-question ids with sub-question labels as column header
-    sub_questions: Dict = {
-        sub_question.full_id: sub_question.label for sub_question in
-        question._questions.values()
-    }
-    grouped_data.rename(columns=sub_questions,
-                        index={"No": "Non-RSE", "Yes": "RSE"},
-                        inplace=True)
-
-    # Plot preferred forms of community event participation grouped by RSE
-    # identity
-    stats.grouped_bar_plot(
-        hifis_surveyval=hifis_surveyval,
-        question=question,
-        grouping=grouping,
-        stats_to_plot=grouped_data.T
-    )
-
-    return stats_tbl
-- 
GitLab


From 5140c64f480146a955786f94565358258dc6f71c Mon Sep 17 00:00:00 2001
From: "Dworatzyk, Katharina" <katharina.dworatzyk@dlr.de>
Date: Sun, 12 Dec 2021 19:12:59 +0100
Subject: [PATCH 12/13] Draft HIFIS services section

---
 report/hifis-survey-2021.tex  | 27 +++++++++++++++------------
 report/sec/hifis-services.tex | 29 +++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 12 deletions(-)
 create mode 100644 report/sec/hifis-services.tex

diff --git a/report/hifis-survey-2021.tex b/report/hifis-survey-2021.tex
index d3277cc5..02ea1eb4 100644
--- a/report/hifis-survey-2021.tex
+++ b/report/hifis-survey-2021.tex
@@ -10,14 +10,15 @@
 
 \title{\textbf{HIFIS Survey Report}}
 \author{
-	Maximilian Dolling, \\
 	Katharina Dworatzyk, \\
-	Fredo Erxleben, \\
+	Thomas Förster, \\
 	Laura Marie Holz, \\
 	Christian Hüser, \\
 	Tobias Huste, \\
 	Uwe Jandt, \\
-	Tobias Schlauch
+	Tobias Schlauch, \\
+	Martin Stoffers, \\
+	Benjamin Wolff
 }
 \date{\textbf{December 2021}}
 
@@ -27,32 +28,33 @@
 		\maketitle
 	\end{titlepage}
 	
-	\section{Introduction} \label{sec:introduction}
-	\input{sec/introduction.tex}
+	\section{HIFIS Survey 2021} \label{sec:methods}
 	
-	\section{Methods} \label{sec:methods}
+	\subsection{Survey structure} \label{subsec:survey_structure}
+	\input{sec/survey-structure.tex}
 	
 	\subsection{Participants} \label{subsec:participants}
 	\input{sec/participants.tex}
 	
-	\subsection{Survey structure} \label{subsec:survey_structure}
-	\input{sec/survey-structure.tex}
-	
 	\subsection{Statistical analysis} \label{subsec:analysis}
 	\input{sec/statistical-analysis.tex}
 	
 	\subsection{HIFIS Surveyval} \label{subsec:framework}
 	\input{sec/hifis-surveyval.tex}
+	\newpage
 	
 	\section{Results and discussion} \label{sec:results}
 	
 	\subsection{Sample characteristics} \label{subsec:sample-characteristics}
 	\input{sec/sample-characteristics.tex}
 	
-	\subsection{HIFIS Cloud} \label{subsec:cloud}
+	\subsection{HIFIS Services}
+	\input{sec/hifis-services.tex}
+	
+	\subsection{HIFIS Cloud Services} \label{subsec:cloud}
 	\input{sec/hifis-cloud.tex}
 	
-	\subsection{HIFIS Software} \label{subsec:services}
+	\subsection{HIFIS Software Services} \label{subsec:services}
 	\input{sec/hifis-software.tex}
 	
 	\subsubsection{Technology} \label{subsubsec:technology}
@@ -69,7 +71,8 @@
 	
 	\subsubsection{Software} \label{subsubsec:software}
 	\input{sec/hifis-software-software.tex}
-
+	\newpage
+	
 	\section{Outlook} \label{sec:outlook}
 	\input{sec/outlook.tex}
 	
diff --git a/report/sec/hifis-services.tex b/report/sec/hifis-services.tex
new file mode 100644
index 00000000..697b5679
--- /dev/null
+++ b/report/sec/hifis-services.tex
@@ -0,0 +1,29 @@
+The \textit{HIFIS} platform consists of three competence clusters, \textit{Backbone}, \textit{Cloud} and \textit{Software Services}, that together aim at providing a \textit{Helmholtz}-wide infrastructure for easily accessible cloud services as well as education and training on sustainable scientific software engineering. The present study focused on the work of \textit{Cloud} and \textit{Software Services}, but we also included three questions addressing general aspects of \textit{HIFIS Services}. First of all, we were interested in assessing the overall use or recognition of \textit{HIFIS Services}. Figure \ref{fig:HIFIS_services_use} shows that there were only marginal differences between respondents belonging to centres that are actively involved in \textit{HIFIS} and those that are not. Fewer than 1 out of 5 respondents indicated that they used any of the services provided by \textit{HIFIS} in the past 12 months. As we did not specify any services, this result can be interpreted in different ways. It could mean that the remaining respondents actually did not use any of the services or that they did not recognize that they were using \textit{HIFIS Services}. The latter would be a positive outcome in particular for \textit{Cloud} and \textit{Backbone}, as it would demonstrate a seamless integration of their services into respondents' workflows. Interestingly, respondents from centres not involved in \textit{HIFIS} were slightly more aware of using \textit{HIFIS} services than respondents from \textit{HIFIS} centres.
+
+\begin{figure}[hbt]
+	\centering
+	\includegraphics[width=\textwidth]{fig/Q035-by-V002-bar-plot.pdf}
+	\caption{Use of \textit{HIFIS} services among \textit{Helmholtz} centres involved in \textit{HIFIS} (n=271) and \textit{Helmholtz} centres not involved in \textit{HIFIS} (n=78).}
+	\label{fig:HIFIS_services_use}
+\end{figure}
+
+The use of tools or support offerings strongly depends on their accessibility. An important factor for the successful use of services that are mediated through social interaction is the language. Within the \textit{HIFIS} team, communication is mostly carried out in English and most services are currently offered in English due to the highly international target group. However, language is not only known as a facilitator but can also pose additional barriers if potential users of a service are not fluent in a particular language. We expected that specifically for services that include teaching as a key element, the acceptance of English as a default language would be lower. Figure \ref{fig:HIFIS_services_language} partly supports this view. For workshops, community events, and consultations, all of which involve a high level of verbal interaction, acceptance of English was comparatively lower than for reading material such as guidelines or documentation. Based on the present results, it appears vital to keep offering at least consultations in German as well. Overall, however, the results suggest a high level of acceptance for offering services solely in English. It would have been interesting to compare these results to the acceptance of German as the default language.
+
+\begin{figure}[hbt]
+	\centering
+	\includegraphics[width=\textwidth]{fig/Q036-bar-plot.pdf}
+	\caption{Language acceptance for different \textit{HIFIS} services.}
+	\label{fig:HIFIS_services_language}
+\end{figure}
+
+
+To optimize our communication strategy in the future and to get an idea of how to reach non-\textit{HIFIS} centres, we asked respondents to indicate their preferred communication channels. As Figure \ref{fig:HIFIS_services_communication} shows, the majority of respondents chose mailing lists or the \textit{HIFIS} homepage as their primary communication channel. Twitter and RSS feeds were not considered suitable communication channels. Again, we did not find any considerable differences between respondents from \textit{HIFIS} centres and those from centres that are not directly involved in \textit{HIFIS}, suggesting that there is no need to adjust our communication strategy depending on the centre.
+
+\begin{figure}[hbt]
+	\centering
+	\includegraphics[width=\textwidth]{fig/Q037-by-V002-bar-plot.pdf}
+	\caption{Preferred communication channels among \textit{Helmholtz} centres involved in \textit{HIFIS} (n=265) and \textit{Helmholtz} centres not involved in \textit{HIFIS} (n=77).}
+	\label{fig:HIFIS_services_communication}
+\end{figure}
+
+In the following, we will present our results for the clusters \textit{Cloud Services} and \textit{Software Services} separately. \textit{Cloud Services}, on the one hand, provides services for the entire \textit{Helmholtz Association} and was, therefore, interested in a broader sample. \textit{Software Services}, on the other hand, focused mainly on respondents who were actively involved in software development during the past 12 months or were interested in improving their software development practices.
\ No newline at end of file
-- 
GitLab


From 72e1873c780a3b8d09adacdbb0072dbdd9498ae8 Mon Sep 17 00:00:00 2001
From: "Dworatzyk, Katharina" <katharina.dworatzyk@dlr.de>
Date: Sun, 12 Dec 2021 19:14:22 +0100
Subject: [PATCH 13/13] Remove introduction

---
 report/sec/introduction.tex | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 report/sec/introduction.tex

diff --git a/report/sec/introduction.tex b/report/sec/introduction.tex
deleted file mode 100644
index 8b137891..00000000
--- a/report/sec/introduction.tex
+++ /dev/null
@@ -1 +0,0 @@
-
-- 
GitLab