From 96881df3b0659a42d66d8c95548d7a7e7870a079 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 4 Feb 2025 15:47:47 +0100 Subject: [PATCH] copied files --- .../tomcat/context_files/.gitignore | 2 + .../tomcat/tomcat_context_template.xml | 23 + .../frost_views.sql | 339 +++++++ .../postgres-ddl.sql | 175 ++++ .../postgres-dml.sql | 14 + .../sta/datastream.sql | 144 +++ .../sta/feature.sql | 33 + .../sta/location.sql | 108 +++ .../sta/observation.sql | 36 + .../sta/observed_property.sql | 28 + .../sta/schema_context.sql | 20 + .../sta/sensor.sql | 84 ++ .../CreateThingInDatabaseAction/sta/thing.sql | 84 ++ src/timeio/__init__.py | 1 + src/timeio/base_handler.py | 187 ++++ src/timeio/check.py | 26 + src/timeio/crontab_setup.py | 115 +++ src/timeio/databases.py | 208 +++++ src/timeio/db_setup.py | 396 ++++++++ src/timeio/file_ingest.py | 133 +++ src/timeio/frost.py | 61 ++ src/timeio/frost_setup.py | 31 + src/timeio/grafana_dashboard_setup.py | 483 ++++++++++ src/timeio/grafana_user_setup.py | 186 ++++ src/timeio/minio_setup.py | 91 ++ src/timeio/mqtt_ingest.py | 56 ++ src/timeio/mqtt_user_setup.py | 78 ++ src/timeio/parsing.py | 413 +++++++++ src/timeio/qualcontrol.py | 844 ++++++++++++++++++ src/timeio/run_qc.py | 81 ++ src/timeio/thing.py | 204 +++++ src/timeio/utils/__init__.py | 3 + src/timeio/utils/cast.py | 47 + src/timeio/utils/common.py | 98 ++ src/timeio/utils/crypto.py | 17 + src/timeio/utils/errors.py | 62 ++ src/timeio/utils/journaling.py | 121 +++ src/timeio/utils/psycopg_helper.py | 78 ++ src/timeio/utils/types.py | 151 ++++ src/timeio/version.py | 1 + 40 files changed, 5262 insertions(+) create mode 100644 src/timeio/CreateNewFrostInstanceAction/tomcat/context_files/.gitignore create mode 100644 src/timeio/CreateNewFrostInstanceAction/tomcat/tomcat_context_template.xml create mode 100644 src/timeio/CreateThingInDatabaseAction/frost_views.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/postgres-ddl.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/postgres-dml.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/datastream.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/feature.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/location.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/observation.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/observed_property.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/schema_context.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/sensor.sql create mode 100644 src/timeio/CreateThingInDatabaseAction/sta/thing.sql create mode 100644 src/timeio/__init__.py create mode 100644 src/timeio/base_handler.py create mode 100644 src/timeio/check.py create mode 100644 src/timeio/crontab_setup.py create mode 100644 src/timeio/databases.py create mode 100644 src/timeio/db_setup.py create mode 100644 src/timeio/file_ingest.py create mode 100644 src/timeio/frost.py create mode 100644 src/timeio/frost_setup.py create mode 100644 src/timeio/grafana_dashboard_setup.py create mode 100644 src/timeio/grafana_user_setup.py create mode 100644 src/timeio/minio_setup.py create mode 100644 src/timeio/mqtt_ingest.py create mode 100644 src/timeio/mqtt_user_setup.py create mode 100644 src/timeio/parsing.py create mode 100755 src/timeio/qualcontrol.py create mode 100644 src/timeio/run_qc.py create mode 100644 src/timeio/thing.py create mode 100644 src/timeio/utils/__init__.py create mode 100644 
src/timeio/utils/cast.py
 create mode 100644 src/timeio/utils/common.py
 create mode 100644 src/timeio/utils/crypto.py
 create mode 100644 src/timeio/utils/errors.py
 create mode 100644 src/timeio/utils/journaling.py
 create mode 100644 src/timeio/utils/psycopg_helper.py
 create mode 100644 src/timeio/utils/types.py
 create mode 100644 src/timeio/version.py

diff --git a/src/timeio/CreateNewFrostInstanceAction/tomcat/context_files/.gitignore b/src/timeio/CreateNewFrostInstanceAction/tomcat/context_files/.gitignore
new file mode 100644
index 00000000..c96a04f0
--- /dev/null
+++ b/src/timeio/CreateNewFrostInstanceAction/tomcat/context_files/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/src/timeio/CreateNewFrostInstanceAction/tomcat/tomcat_context_template.xml b/src/timeio/CreateNewFrostInstanceAction/tomcat/tomcat_context_template.xml
new file mode 100644
index 00000000..5025a82e
--- /dev/null
+++ b/src/timeio/CreateNewFrostInstanceAction/tomcat/tomcat_context_template.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Context path="/{schema}" docBase="/share/FROST-Server.war">
+    <Parameter override="false" name="ApiVersion" value="v1.0" description="The version tag of the API used in the URL."/>
+    <Parameter override="false" name="serviceRootUrl" value="{tomcat_proxy_url}{schema}" description="The base URL of the SensorThings Server without version."/>
+    <Parameter override="false" name="defaultCount" value="false" description="The default value for the $count query option."/>
+    <Parameter override="false" name="defaultTop" value="100" description="The default value for the $top query option."/>
+    <Parameter override="false" name="maxTop" value="1000" description="The maximum allowed value for the $top query option."/>
+    <Parameter override="false" name="maxDataSize" value="25000000" description="The maximum allowed estimated data size (in bytes) for responses."/>
+    <Parameter override="false" name="bus.busImplementationClass" value="de.fraunhofer.iosb.ilt.frostserver.messagebus.InternalMessageBus" description="The java class used for connecting to the message bus."/>
+    <Parameter override="false" name="persistence.persistenceManagerImplementationClass" value="de.fraunhofer.iosb.ilt.frostserver.persistence.pgjooq.imp.PostgresPersistenceManagerLong" description="The java class used for persistence (must implement the PersistenceManager interface)."/>
+    <Parameter override="false" name="persistence.idGenerationMode" value="ServerGeneratedOnly" description="Mode for id generation when using PostgresPersistenceManagerString."/>
+    <Parameter override="false" name="persistence.autoUpdateDatabase" value="false" description="Automatically apply database updates."/>
+    <Parameter override="false" name="persistence.alwaysOrderbyId" value="false" description="Always add an 'orderby=id asc' to queries to ensure consistent paging."/>
+    <Parameter override="false" name="persistence.db_jndi_datasource" value="jdbc/sensorThings" description="JNDI data source name"/>
+    <Resource
+        name="jdbc/sensorThings" auth="Container"
+        type="javax.sql.DataSource" driverClassName="org.postgresql.Driver"
+        url="jdbc:{db_url}"
+        username="{username}" password="{password}"
+        maxTotal="20" maxIdle="10" maxWaitMillis="-1"
+        defaultAutoCommit="false"
+    />
+</Context>
\ No newline at end of file
diff --git a/src/timeio/CreateThingInDatabaseAction/frost_views.sql b/src/timeio/CreateThingInDatabaseAction/frost_views.sql
new file mode 100644
index 00000000..4e9b887a
--- /dev/null
+++
b/src/timeio/CreateThingInDatabaseAction/frost_views.sql @@ -0,0 +1,339 @@ +BEGIN; + +--- CREATE VIEW "THINGS" --- +DROP VIEW IF EXISTS "THINGS" CASCADE; +CREATE OR REPLACE VIEW "THINGS" AS +SELECT DISTINCT c.id AS "ID", + c.description AS "DESCRIPTION", + c.label AS "NAME", + jsonb_build_object( + 'url', %(sms_url)s || 'configurations/' || c.id, + 'pid', c.persistent_identifier, + 'status', c.status, + 'mobile', CASE + WHEN MAX(cdl.begin_date) IS NULL THEN 'false' + ELSE 'true' + END, + 'organizations', ARRAY_AGG(DISTINCT co.organization), + 'projects', ARRAY_AGG(DISTINCT c.project), + 'when_dynamic', ARRAY_AGG( + DISTINCT CASE + WHEN cdl.end_date IS NULL THEN + TO_CHAR(cdl.begin_date, + 'YYYY-MM-DD HH24:MI:SS TZ') + ELSE + TO_CHAR(cdl.begin_date, + 'YYYY-MM-DD HH24:MI:SS TZ') || + '/' || + TO_CHAR(cdl.end_date, 'YYYY-MM-DD HH24:MI:SS TZ') + END + ), + 'when_stationary', ARRAY_AGG( + DISTINCT CASE + WHEN csl.end_date IS NULL THEN + TO_CHAR(csl.begin_date, + 'YYYY-MM-DD HH24:MI:SS TZ') + ELSE + TO_CHAR(csl.begin_date, + 'YYYY-MM-DD HH24:MI:SS TZ') || + '/' || + TO_CHAR(csl.end_date, 'YYYY-MM-DD HH24:MI:SS TZ') + END + ), + 'contacts', ARRAY_AGG( + DISTINCT jsonb_build_object( + 'name', CONCAT(co.given_name, ' ', co.family_name), + 'email', co.email, + 'organization', co.organization, + 'orcid', co.orcid + )) + ) AS "PROPERTIES" +FROM public.sms_configuration c + JOIN public.sms_configuration_contact_role ccr ON c.id = ccr.configuration_id + JOIN public.sms_contact co ON ccr.contact_id = co.id + JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl + ON c.id = cdl.configuration_id + LEFT JOIN public.sms_configuration_static_location_begin_action csl + ON c.id = csl.configuration_id +WHERE ((cdl.configuration_id IS NOT NULL) OR (csl.configuration_id IS NOT NULL)) + AND dsl.datasource_id = %(tsm_schema)s + AND c.is_public +GROUP BY c.id, c.description, c.label, c.persistent_identifier, c.status, c.is_public, + cdl.configuration_id, csl.configuration_id, dsl.datasource_id +ORDER BY c.id ASC; + + +--- CREATE VIEW SENSORS --- +DROP VIEW IF EXISTS "SENSORS" CASCADE; +CREATE OR REPLACE VIEW "SENSORS" AS +SELECT DISTINCT d.id AS "ID", + d.short_name AS "NAME", + d.description AS "DESCRIPTION", + 'html'::text AS "ENCODING_TYPE", + %(sms_url)s || 'backend/api/v1/devices/' || d.id || + '/sensorml' AS "METADATA", + jsonb_build_object( + 'url', %(sms_url)s || 'devices/' || d.id, + 'pid', d.persistent_identifier, + 'type', d.device_type_name, + 'contacts', ARRAY_AGG(DISTINCT jsonb_build_object( + 'email', co.email, + 'organization', co.organization, + 'name', CONCAT(co.given_name, ' ', co.family_name), + 'orcid', co.orcid + )), + 'manufacturer', d.manufacturer_name, + 'model', d.model, + 'serialNumber', d.serial_number + ) AS "PROPERTIES" +FROM public.sms_device d + JOIN public.sms_device_mount_action dma ON d.id = dma.device_id + JOIN public.sms_configuration_contact_role ccr + ON dma.configuration_id = ccr.configuration_id + JOIN public.sms_contact co ON ccr.contact_id = co.id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + WHERE dsl.datasource_id = %(tsm_schema)s + AND d.is_public +GROUP BY d.id, d.short_name, d.description, d.persistent_identifier, d.device_type_name, + d.manufacturer_name, d.model, d.serial_number, d.is_public, dsl.datasource_id +ORDER BY d.id ASC; + + +--- CREATE VIEW LOCATIONS --- +DROP 
VIEW IF EXISTS "LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "LOCATIONS" AS +SELECT DISTINCT csl.id AS "ID", + csl.label AS "NAME", + csl.begin_description AS "DESCRIPTION", + 'application/geo+json'::text AS "ENCODING_TYPE", + public.ST_ASGeoJSON( + public.ST_SetSRID( + public.ST_MakePoint(csl.x, csl.y), + 4326 + ) + ) AS "LOCATION", + jsonb_build_object() AS "PROPERTIES" +FROM public.sms_configuration_static_location_begin_action csl + JOIN public.sms_configuration c ON csl.configuration_id = c.id + JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + WHERE dsl.datasource_id = %(tsm_schema)s + AND c.is_public +ORDER BY csl.id; + + +--- CREATE VIEW THINGS_LOCATIONS --- +DROP VIEW IF EXISTS "THINGS_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "THINGS_LOCATIONS" AS +SELECT DISTINCT ON (c.id) c.id AS "THING_ID", + csl.id AS "LOCATION_ID" +FROM public.sms_configuration c + JOIN public.sms_configuration_static_location_begin_action csl + ON c.id = csl.configuration_id + JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + WHERE dsl.datasource_id = %(tsm_schema)s + AND c.is_public +ORDER BY c.id, csl.begin_date DESC; + + +--- CREATE VIEW LOCATIONS_HIST_LOCATIONS --- +DROP VIEW IF EXISTS "LOCATIONS_HIST_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "LOCATIONS_HIST_LOCATIONS" AS +--build cte that returns configuration_ids and location_ids (configuration_static_location_begin_action.id) +WITH config_locations AS (SELECT DISTINCT c.id AS c_id, + csl.id AS csl_id + FROM public.sms_configuration c + JOIN public.sms_configuration_static_location_begin_action csl + ON c.id = csl.configuration_id + JOIN public.sms_device_mount_action dma + on c.id = dma.configuration_id + JOIN public.sms_datastream_link dsl + on dma.id = dsl.device_mount_action_id + WHERE dsl.datasource_id = %(tsm_schema)s + AND c.is_public), +-- return newest location (highest csl_id) for each configuration_id +-- might need to be decided based on timestamp. 
+-- for now, taking the highest id works fine + locations AS (SELECT c_id, + MAX(csl_id) AS max_csl_id + FROM config_locations + GROUP BY c_id) +-- join locations on newest locaiton per configuration id +SELECT loc.max_csl_id AS "LOCATION_ID", + cl.csl_id AS "HIST_LOCATION_ID" +-- join locations on newest location on configuration id +FROM config_locations cl + JOIN locations loc ON cl.c_id = loc.c_id +-- leave out newest location in join +WHERE cl.csl_id <> loc.max_csl_id +ORDER BY loc.max_csl_id ASC, cl.csl_id ASC +--returns hist_location_id for mapped to current location_id for each configuration_id +; + + +--- CREATE VIEW HIST_LOCATIONS --- +DROP VIEW IF EXISTS "HIST_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "HIST_LOCATIONS" AS +WITH cte AS (SELECT c.id AS "THING_ID", + csl.id "ID", + csl.begin_date AS "TIME", + ROW_NUMBER() + OVER (PARTITION BY c.id ORDER BY csl.begin_date DESC) AS row_num + FROM public.sms_configuration c + JOIN public.sms_configuration_static_location_begin_action csl + ON c.id = csl.configuration_id + WHERE c.is_public) +SELECT DISTINCT "THING_ID", + "ID", + "TIME" +FROM cte + JOIN public.sms_device_mount_action dma + on cte."THING_ID" = dma.configuration_id + JOIN public.sms_datastream_link dsl on dma.id = dsl.device_mount_action_id +WHERE row_num > 1 + AND dsl.datasource_id = %(tsm_schema)s; + + +--- CREATE VIEW FEATURE_OF_INTEREST --- + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_name = 'FEATURES' + AND table_schema = %(tsm_schema)s + AND table_type = 'BASE TABLE') + THEN EXECUTE 'DROP TABLE "FEATURES" CASCADE'; + ELSIF EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_name = 'FEATURES' + AND table_schema = %(tsm_schema)s + AND table_type = 'VIEW' + ) + THEN EXECUTE 'DROP VIEW "FEATURES" CASCADE'; + END IF; +END $$; +CREATE TABLE "FEATURES" ( + "ID" serial, + "NAME" text, + "DESCRIPTION" text, + "ENCODING_TYPE" text, + "FEATURE" jsonb, + "PROPERTIES" jsonb +); + +--- CREATE VIEW OBSERVATIONS --- +DROP VIEW IF EXISTS "OBSERVATIONS" CASCADE; +CREATE OR REPLACE VIEW "OBSERVATIONS" AS +SELECT + o.result_boolean AS "RESULT_BOOLEAN", + o.result_quality AS "RESULT_QUALITY", + o.phenomenon_time_start AS "PHENOMENON_TIME_START", + jsonb_build_object() AS "PARAMETERS", + dsl.device_property_id AS "DATASTREAM_ID", + o.result_string AS "RESULT_STRING", + o.result_type AS "RESULT_TYPE", + o.valid_time_end AS "VALID_TIME_END", + o.phenomenon_time_end AS "PHENOMENON_TIME_END", + null AS "FEATURE_ID", + row_number() OVER () AS "ID", + o.result_json AS "RESULT_JSON", + o.result_time AS "RESULT_TIME", + o.result_number AS "RESULT_NUMBER", + o.valid_time_start AS "VALID_TIME_START" +FROM public.sms_datastream_link dsl +JOIN observation o ON o.datastream_id = dsl.datastream_id +JOIN public.sms_device_mount_action dma ON dma.id = dsl.device_mount_action_id +JOIN public.sms_device d ON d.id = dma.device_id +JOIN public.sms_configuration c ON c.id = dma.configuration_id +WHERE c.is_public AND d.is_public AND dsl.datasource_id = %(tsm_schema)s; + + +--- CREATE VIEW DATASTREAMS --- +DROP VIEW IF EXISTS "DATASTREAMS" CASCADE; +CREATE OR REPLACE VIEW "DATASTREAMS" AS SELECT + dsl.device_property_id AS "ID", + CONCAT( + c.label, '_', + d.short_name, '_', + dp.property_name, '_', + dma.offset_z, '_', + dp.aggregation_type_name) AS "NAME", + CONCAT( + d.short_name, '_', + dp.property_name, '_', + dma.offset_z, ' at site ', + c.label, ' with aggregation function ', + dp.aggregation_type_name) AS "DESCRIPTION", + c.id AS "THING_ID", 
+ d.id AS "SENSOR_ID", + 'OM_Observation' AS "OBSERVATION_TYPE", + dma.begin_date AS "PHENOMENON_TIME_START", + dma.begin_date AS "RESULT_TIME_START", + dma.end_date AS "PHENOMENON_TIME_END", + dma.end_date AS "RESULT_TIME_END", + -- we don't provide an observed area, as this is really expensive + null as "OBSERVED_AREA", + dp.unit_uri AS "UNIT_DEFINITION", + dp.property_name AS "UNIT_NAME", + dp.unit_name AS "UNIT_SYMBOL", + CASE + WHEN dp.property_uri = '' THEN NULL + ELSE reverse(split_part(reverse(dp.property_uri::text), '/'::text, + 2))::integer + END as "OBS_PROPERTY_ID", + jsonb_build_object( + 'sensorOutput', dp.property_name, + 'offset', jsonb_build_object( + 'z', dma.offset_z, + 'unit', 'm'), + 'aggregation', jsonb_build_object( + 'period', dsl.aggregation_period, + 'function', dp.aggregation_type_name), + 'quality', jsonb_build_object( + 'resolution', dp.resolution, + 'resolution_unit', dp.resolution_unit_name, + 'accuracy', dp.accuracy, + 'measuring_range_min', dp.measuring_range_min, + 'measuring_range_max', dp.measuring_range_max), + 'license', '') AS "PROPERTIES" + +FROM public.sms_datastream_link dsl + JOIN public.sms_device_mount_action dma ON dma.id = dsl.device_mount_action_id + JOIN public.sms_device d ON d.id = dma.device_id + JOIN public.sms_configuration c ON c.id = dma.configuration_id + JOIN public.sms_device_property dp ON dp.id = dsl.device_property_id +WHERE dsl.datasource_id = %(tsm_schema)s AND c.is_public AND d.is_public; + +--- CREATE VIEW OBS_PROPERTIES --- +DROP VIEW IF EXISTS "OBS_PROPERTIES" CASCADE; +CREATE OR REPLACE VIEW "OBS_PROPERTIES" AS +SELECT DISTINCT mq.id as "ID", + mq.term as "NAME", + mq.provenance_uri "DEFINITION", + mq.definition as "DESCRIPTION", + jsonb_build_object('url', %(cv_url)s || 'api/v1/measuredquantities/' || mq.id) + as "PROPERTIES" +FROM public.sms_cv_measured_quantity mq + JOIN public.sms_device_property dp + ON mq.id = reverse(split_part(reverse(dp.property_uri), '/', 2))::int + JOIN public.sms_device_mount_action dma ON dp.device_id = dma.device_id + JOIN public.sms_configuration c ON dma.configuration_id = c.id + JOIN public.sms_device d ON dma.device_id = d.id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + LEFT JOIN public.sms_configuration_static_location_begin_action csl + on dma.configuration_id = csl.configuration_id + LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl + on dma.configuration_id = cdl.configuration_id +WHERE (cdl.configuration_id IS NOT NULL OR csl.configuration_id IS NOT NULL) + AND c.is_public + AND d.is_public + AND dp.property_uri <> '' + AND dsl.datasource_id = %(tsm_schema)s; + +COMMIT; diff --git a/src/timeio/CreateThingInDatabaseAction/postgres-ddl.sql b/src/timeio/CreateThingInDatabaseAction/postgres-ddl.sql new file mode 100644 index 00000000..e7e491f3 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/postgres-ddl.sql @@ -0,0 +1,175 @@ +BEGIN; + +-- +-- Create model Thing +-- +CREATE TABLE IF NOT EXISTS "thing" +( + "id" bigserial NOT NULL PRIMARY KEY, + "name" varchar(200) NOT NULL, + "uuid" uuid NOT NULL UNIQUE, + "description" text NULL, + "properties" jsonb NULL +); + + +-- +-- Create model Journal +-- +CREATE TABLE IF NOT EXISTS "journal" +( + "id" bigserial NOT NULL PRIMARY KEY, + "timestamp" timestamp NOT NULL, +-- Added below by ALTER TABLE +-- "timestamp" timestamp with time zone NOT NULL, + "level" varchar(30) NOT NULL, + "message" text NULL, + "extra" jsonb NULL, + "thing_id" bigint NOT NULL, +-- Added below by ALTER TABLE 
+-- "origin" varchar(200) NULL + + CONSTRAINT "journal_thing_id_fk_thing_id" FOREIGN KEY ("thing_id") REFERENCES "thing" ("id") DEFERRABLE INITIALLY DEFERRED +); +ALTER TABLE "journal" + ADD COLUMN IF NOT EXISTS "origin" varchar(200) NULL; +ALTER TABLE "journal" + ALTER COLUMN "timestamp" TYPE timestamp with time zone; +-- Conditionally create the index if it does not exist +DO +$$ + BEGIN + IF NOT EXISTS (SELECT 1 + FROM pg_indexes + WHERE tablename = 'journal' + AND indexname = 'journal_thing_id') THEN + EXECUTE 'CREATE INDEX "journal_thing_id" ON "journal" ("thing_id")'; + END IF; + END +$$; + + +-- +-- Create model Datastream +-- +CREATE TABLE IF NOT EXISTS "datastream" +( + "id" bigserial NOT NULL PRIMARY KEY, + "name" varchar(200) NOT NULL, + "description" text NULL, + "properties" jsonb NULL, + "position" varchar(200) NOT NULL, + "thing_id" bigint NOT NULL, + + CONSTRAINT "datastream_thing_id_position_9f2cfe68_uniq" UNIQUE ("thing_id", "position"), + CONSTRAINT "datastream_thing_id_f55522a4_fk_thing_id" FOREIGN KEY ("thing_id") REFERENCES "thing" ("id") DEFERRABLE INITIALLY DEFERRED +); +-- Conditionally create the index if it does not exist +DO +$$ + BEGIN + IF NOT EXISTS (SELECT 1 + FROM pg_indexes + WHERE tablename = 'datastream' + AND indexname = 'datastream_thing_id_f55522a4') THEN + EXECUTE 'CREATE INDEX "datastream_thing_id_f55522a4" ON "datastream" ("thing_id")'; + END IF; + END +$$; + + +-- +-- Create model Observation +-- +CREATE TABLE IF NOT EXISTS "observation" +( +-- "id" bigserial NOT NULL PRIMARY KEY, + "phenomenon_time_start" timestamp with time zone NULL, + "phenomenon_time_end" timestamp with time zone NULL, + "result_time" timestamp with time zone NOT NULL, + "result_type" smallint NOT NULL, + "result_number" double precision NULL, + "result_string" varchar(200) NULL, + "result_json" jsonb NULL, + "result_boolean" boolean NULL, + "result_latitude" double precision NULL, + "result_longitude" double precision NULL, + "result_altitude" double precision NULL, + "result_quality" jsonb NULL, + "valid_time_start" timestamp with time zone NULL, + "valid_time_end" timestamp with time zone NULL, + "parameters" jsonb NULL, + "datastream_id" bigint NOT NULL, + + CONSTRAINT "observation_datastream_id_result_time_1d043396_uniq" UNIQUE ("datastream_id", "result_time"), + CONSTRAINT "observation_datastream_id_77f5c4fb_fk_datastream_id" FOREIGN KEY ("datastream_id") REFERENCES "datastream" ("id") DEFERRABLE INITIALLY DEFERRED +); +-- -- Conditionally make a TimescaleDB hypertable if not already done +-- DO +-- $$ +-- BEGIN +-- IF NOT EXISTS (SELECT 1 +-- FROM timescaledb_information.hypertables +-- WHERE hypertable_name = 'observation' +-- AND hypertable_schema = current_schema()) THEN +-- -- The table is not a hypertable, so create it +-- PERFORM public.create_hypertable('observation', 'result_time'); +-- END IF; +-- END +-- $$; +-- Conditionally create the index if it does not exist +DO +$$ + BEGIN + IF NOT EXISTS (SELECT 1 + FROM pg_indexes + WHERE tablename = 'observation' + AND indexname = 'observation_datastream_id_77f5c4fb') THEN + EXECUTE 'CREATE INDEX "observation_datastream_id_77f5c4fb" ON "observation" ("datastream_id")'; + END IF; + END +$$; + + +-- +-- Create model Relation role +-- +CREATE TABLE IF NOT EXISTS relation_role +( + id bigint GENERATED BY DEFAULT AS IDENTITY, + name varchar(200) NOT NULL, + definition text NULL, + inverse_name varchar(200) NOT NULL, + inverse_definition text NULL, + description text NULL, + properties jsonb NULL, + + CONSTRAINT 
relation_role_pk PRIMARY KEY (id), + CONSTRAINT relation_role_pk_2 UNIQUE (name), + CONSTRAINT relation_role_pk_3 UNIQUE (inverse_name) +); + + +-- +-- Create model Related datastreams +-- +CREATE TABLE IF NOT EXISTS related_datastream +( + id bigint GENERATED ALWAYS AS IDENTITY, + datastream_id bigint NOT NULL, + role_id bigint NOT NULL, + target_id bigint NOT NULL, + + CONSTRAINT related_datastream_pk PRIMARY KEY (id), + CONSTRAINT related_datastream_pk_2 UNIQUE (datastream_id, role_id, target_id), + CONSTRAINT related_datastream_datastream_id_fk FOREIGN KEY (datastream_id) REFERENCES datastream (id) DEFERRABLE INITIALLY DEFERRED, + CONSTRAINT related_datastream_datastream_id_fk_2 FOREIGN KEY (target_id) REFERENCES datastream (id) DEFERRABLE INITIALLY DEFERRED, + CONSTRAINT related_datastream_relation_role_id_fk FOREIGN KEY (role_id) REFERENCES relation_role (id) DEFERRABLE INITIALLY DEFERRED +); +-- cannot be done inside the CREATE TABLE statement but is idempotent +COMMENT ON COLUMN related_datastream.datastream_id IS 'Domain, source or set of departure of the relation.'; +COMMENT ON COLUMN related_datastream.role_id IS 'The type of the relation.'; +COMMENT ON COLUMN related_datastream.target_id IS 'Codomain, target or set of destination of the relation.'; + + +COMMIT; diff --git a/src/timeio/CreateThingInDatabaseAction/postgres-dml.sql b/src/timeio/CreateThingInDatabaseAction/postgres-dml.sql new file mode 100644 index 00000000..88b43db6 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/postgres-dml.sql @@ -0,0 +1,14 @@ +BEGIN; + +INSERT INTO relation_role + (id, name, definition, inverse_name, inverse_definition, description, properties) +VALUES + (1, 'created_by', 'This was created by other(s)', 'created', 'Other(s) created this', 'A derived product', null) +ON CONFLICT (id) DO UPDATE SET + name = excluded.name, + definition = excluded.definition, + inverse_name = excluded.inverse_name, + inverse_definition = excluded.inverse_definition, + description = excluded.description, + properties = excluded.properties; +COMMIT; diff --git a/src/timeio/CreateThingInDatabaseAction/sta/datastream.sql b/src/timeio/CreateThingInDatabaseAction/sta/datastream.sql new file mode 100644 index 00000000..0bc55ed5 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/datastream.sql @@ -0,0 +1,144 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "DATASTREAMS" CASCADE; +CREATE OR REPLACE VIEW "DATASTREAMS" AS +SELECT + dsl.device_property_id AS "ID", + concat(c.label, ':', + d.short_name, ':', + dp.property_name + ) as "NAME", + concat(d.short_name, ' ', + dp.property_name, ' ', + dma.offset_z, 'm at site ', + c.label, ' with aggregation function ', + dp.aggregation_type_name, ' and period ', + dsl.aggregation_period , 's' + ) as "DESCRIPTION", + c.id as "THING_ID", + d.id as "SENSOR_ID", + CASE + WHEN dp.property_uri = '' THEN NULL + ELSE reverse(split_part(reverse(dp.property_uri::text), '/'::text, + 2))::integer + END as "OBS_PROPERTY_ID", + dp.unit_uri AS "UNIT_DEFINITION", + dp.property_name AS "UNIT_NAME", + dp.unit_name AS "UNIT_SYMBOL", + 'OM_Observation' as "OBSERVATION_TYPE", + jsonb_build_object( + 'name', '', + 'symbol', dp.unit_name, + 'definition', dp.unit_uri + ) as "UNIT_OF_MEASUREMENT", + public.st_asgeojson( + public.st_convexhull( + public.ST_SetSRID( + public.ST_MakePoint(csl.x, csl.y), + 4326 + ) + ) + ) as "OBSERVED_AREA", + null as "RESULT_TIME", + null as "PHENOMENON_TIME", + dma.begin_date AS "PHENOMENON_TIME_START", + dma.begin_date AS 
"RESULT_TIME_START", + dma.end_date AS "PHENOMENON_TIME_END", + dma.end_date AS "RESULT_TIME_END", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.id', %(sms_url)s || 'datastream-links/' || MAX(dsl.id), + 'jsonld.type', 'DatastreamProperties', + 'observingProcedure', jsonb_build_object( + 'jsonld.type', 'ObservingProcedure', + 'name', dp.aggregation_type_name, + 'description', cv_agg.definition, + 'definition', dp.aggregation_type_uri, + 'properties', jsonb_build_object( + 'period', dsl.aggregation_period, + 'unitOfPeriod', jsonb_build_object( + 'jsonld.type', 'Unit', + 'name', cv_u.provenance, + 'symbol', cv_u.term, + 'definition', dp.unit_uri + ) + ) + ), + 'measurementProperties', jsonb_build_object( + 'jsonld.type', 'MeasurementProperties', + 'measurementResolution', dp.resolution , + 'unitOfMeasurementResolution', jsonb_build_object( + 'jsonld.type', 'Unit', + 'name', cv_ur.provenance, + 'symbol', dp.resolution_unit_name, + 'definition', dp.resolution_unit_uri + ), + 'measurementAccuracy', dp.accuracy, + 'unitOfMeasurementAccuracy', jsonb_build_object( + 'jsonld.type', 'Unit', + 'name', cv_ua.provenance, + 'symbol', dp.accuracy_unit_name , + 'definition', dp.accuracy_unit_uri + ), + 'operationRange', array[dp.measuring_range_min,dp.measuring_range_max], + 'unitOfOperationRange', jsonb_build_object( + 'jsonld.type', 'Unit', + 'name', cv_ua.provenance, + 'symbol', dp.accuracy_unit_name , + 'definition', dp.accuracy_unit_uri + ) + ), + 'license', jsonb_build_object( + 'jsonld.type', 'CreativeWork', + 'name', cv_l.term, + 'url', cv_l.provenance_uri, + 'provider', cv_l.definition + ), + 'providerMobility', CASE WHEN MAX(cdl.begin_date) IS NULL THEN 'static' ELSE 'dynamic' end, + 'deployment', jsonb_build_object( + 'jsonld.id', %(sms_url)s || 'configurations/' || c.id || '/platforms-and-devices?deviceMountAction=' || dma.id, + 'jsonld.type', 'Deployment', + 'name', dma."label", + 'description', dma.begin_description, + 'deploymentTime', dma.begin_date, + 'properties', jsonb_build_object( + 'jsonld.type', 'DeploymentProperties', + 'offsets', jsonb_build_object( + 'jsonld.type', 'Offset', + 'x', dma.offset_x, + 'y', dma.offset_y, + 'z', dma.offset_z + ), + 'unitOfOffsets', jsonb_build_object( + 'jsonld.type', 'Unit', + 'name', 'meters', + 'symbol', 'm', + -- this should be generated automatically. 
we need to find the unit id for meter + 'definition', 'https://sms-cv.helmholtz.cloud/sms/cv/api/v1/units/63' + ) + ) + ), + 'dataSource', '' + ) as "PROPERTIES" +FROM public.sms_datastream_link dsl +JOIN public.sms_device_mount_action dma ON dma.id = dsl.device_mount_action_id +JOIN public.sms_device d ON d.id = dma.device_id +JOIN public.sms_configuration c ON c.id = dma.configuration_id +JOIN public.sms_device_property dp ON dp.id = dsl.device_property_id +LEFT JOIN public.sms_cv_aggregation_type cv_agg ON coalesce(nullif(split_part(dp.aggregation_type_uri,'/',9),'')::integer) =cv_agg.id +LEFT JOIN public.sms_cv_unit cv_u ON coalesce(nullif(split_part(dp.unit_uri ,'/',9),'')::integer) =cv_u.id +LEFT JOIN public.sms_cv_unit cv_ur ON coalesce(nullif(split_part(dp.resolution_unit_uri ,'/',9),'')::integer) =cv_ur.id +LEFT JOIN public.sms_cv_unit cv_ua ON coalesce(nullif(split_part(dp.accuracy_unit_uri ,'/',9),'')::integer) =cv_ua.id +LEFT JOIN public.sms_cv_license cv_l ON coalesce(nullif(split_part(dsl.license_uri,'/',9),'')::integer) =cv_l.id +LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl ON c.id = cdl.configuration_id +LEFT JOIN public.sms_configuration_static_location_begin_action csl ON c.id = csl.configuration_id +WHERE c.is_public AND d.is_public AND dsl.datasource_id = %(tsm_schema)s +GROUP BY dsl.device_property_id, c.label, d.short_name, dp.property_name, dma.offset_z, dp.aggregation_type_name, dsl.aggregation_period, + dp.unit_name, dp.unit_uri, d.id, dp.id, cv_agg.definition, dp.aggregation_type_uri, cv_u.provenance, cv_u.term, dp.resolution, cv_ur.provenance, + dp.resolution_unit_name, dp.resolution_unit_uri, dp.accuracy, cv_ua.provenance, dp.accuracy_unit_name, dp.accuracy_unit_uri, dp.measuring_range_min, + dp.measuring_range_max, cv_l.term, cv_l.provenance_uri, cv_l.definition, c.id, dma.id, dma.label, dma.begin_description, dma.begin_date, dma.offset_x, + dma.offset_y, csl.x, csl.y, dp.property_uri, dma.end_date; + +COMMIT; \ No newline at end of file diff --git a/src/timeio/CreateThingInDatabaseAction/sta/feature.sql b/src/timeio/CreateThingInDatabaseAction/sta/feature.sql new file mode 100644 index 00000000..765d34d2 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/feature.sql @@ -0,0 +1,33 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_name = 'FEATURES' + AND table_schema = %(tsm_schema)s + AND table_type = 'BASE TABLE') + THEN EXECUTE 'DROP TABLE "FEATURES" CASCADE'; + ELSIF EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_name = 'FEATURES' + AND table_schema = %(tsm_schema)s + AND table_type = 'VIEW' + ) + THEN EXECUTE 'DROP VIEW "FEATURES" CASCADE'; + END IF; +END $$; +CREATE TABLE "FEATURES" ( + "ID" serial, + "NAME" text, + "DESCRIPTION" text, + "ENCODING_TYPE" text, + "FEATURE" jsonb, + "PROPERTIES" jsonb +); + +COMMIT; \ No newline at end of file diff --git a/src/timeio/CreateThingInDatabaseAction/sta/location.sql b/src/timeio/CreateThingInDatabaseAction/sta/location.sql new file mode 100644 index 00000000..6d6af57c --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/location.sql @@ -0,0 +1,108 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "LOCATIONS" AS +SELECT DISTINCT + csl.id AS "ID", + csl.label AS "NAME", + csl.begin_description AS "DESCRIPTION", + 'application/geo+json'::text AS "ENCODING_TYPE", + public.ST_ASGeoJSON( + 
public.ST_SetSRID( + public.ST_MakePoint(csl.x, csl.y), + 4326 + ) + ) AS "LOCATION", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.id', %(sms_url)s || 'configurations/' || c.id || 'locations/static-location-actions/' || csl.id, + 'jsonld.type', 'LocationProperties' + ) AS "PROPERTIES" +FROM public.sms_configuration_static_location_begin_action csl +JOIN public.sms_configuration c ON csl.configuration_id = c.id +JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id +JOIN public.sms_device d ON dma.device_id = d.id +JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id +WHERE dsl.datasource_id = %(tsm_schema)s +AND c.is_public AND d.is_public +ORDER BY csl.id; + + +--- CREATE VIEW THINGS_LOCATIONS --- +DROP VIEW IF EXISTS "THINGS_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "THINGS_LOCATIONS" AS +SELECT DISTINCT ON (c.id) + c.id AS "THING_ID", + csl.id AS "LOCATION_ID" +FROM public.sms_configuration c +JOIN public.sms_configuration_static_location_begin_action csl +ON c.id = csl.configuration_id +JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id +JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id +JOIN public.sms_device d ON dma.device_id = d.id +WHERE dsl.datasource_id = %(tsm_schema)s +AND c.is_public AND d.is_public +ORDER BY c.id, csl.begin_date DESC; + + +DROP VIEW IF EXISTS "LOCATIONS_HIST_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "LOCATIONS_HIST_LOCATIONS" AS +--build cte that returns configuration_ids and location_ids (configuration_static_location_begin_action.id) +WITH locations AS ( +SELECT DISTINCT + c.id AS c_id, + csl.id AS csl_id +FROM public.sms_configuration c +JOIN public.sms_configuration_static_location_begin_action csl ON c.id = csl.configuration_id +JOIN public.sms_device_mount_action dma on c.id = dma.configuration_id +JOIN public.sms_device d ON dma.device_id = d.id +JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id +WHERE dsl.datasource_id = %(tsm_schema)s +AND c.is_public AND d.is_public +), +current_locations AS ( +SELECT + c_id, + MAX(csl_id) AS max_csl_id +FROM locations +GROUP BY c_id +) + +SELECT + cl.max_csl_id AS "LOCATION_ID", + loc.csl_id AS "HIST_LOCATION_ID" +FROM locations loc +-- join locations on current configuration location +JOIN current_locations cl ON loc.c_id = cl.c_id +ORDER BY cl.max_csl_id ASC, loc.csl_id ASC +--returns hist_location_id mapped to current location_id for each configuration_id +; + +DROP VIEW IF EXISTS "HIST_LOCATIONS" CASCADE; +CREATE OR REPLACE VIEW "HIST_LOCATIONS" AS + +WITH cte AS ( +SELECT + c.id AS "THING_ID", + csl.id "ID", + csl.begin_date AS "TIME", +ROW_NUMBER() OVER (PARTITION BY c.id ORDER BY csl.begin_date DESC) AS row_num +FROM public.sms_configuration c +JOIN public.sms_configuration_static_location_begin_action csl ON c.id = csl.configuration_id +JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id +JOIN public.sms_device d ON dma.device_id = d.id +JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id +WHERE c.is_public AND d.is_public +AND dsl.datasource_id = %(tsm_schema)s +) + +SELECT DISTINCT "THING_ID", + "ID", + "TIME" +FROM cte +WHERE row_num > 1; + + +COMMIT; \ No newline at end of file diff --git a/src/timeio/CreateThingInDatabaseAction/sta/observation.sql b/src/timeio/CreateThingInDatabaseAction/sta/observation.sql new file mode 100644 index 00000000..9131a7dc --- /dev/null +++ 
b/src/timeio/CreateThingInDatabaseAction/sta/observation.sql @@ -0,0 +1,36 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "OBSERVATIONS" CASCADE; +CREATE OR REPLACE VIEW "OBSERVATIONS" AS +SELECT + o.result_boolean AS "RESULT_BOOLEAN", + o.result_quality AS "RESULT_QUALITY", + o.result_time AS "PHENOMENON_TIME_START", + jsonb_build_object() AS "PARAMETERS", + dsl.device_property_id AS "DATASTREAM_ID", + o.result_string AS "RESULT_STRING", + o.result_type AS "RESULT_TYPE", + o.valid_time_end AS "VALID_TIME_END", + o.result_time AS "PHENOMENON_TIME_END", + null AS "FEATURE_ID", + row_number() OVER () AS "ID", + o.result_json AS "RESULT_JSON", + o.result_time AS "RESULT_TIME", + o.result_number AS "RESULT_NUMBER", + o.valid_time_start AS "VALID_TIME_START", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.type', 'ObservationProperties', + 'dataSource', NULL + ) AS "PROPERTIES" +FROM public.sms_datastream_link dsl +JOIN observation o ON o.datastream_id = dsl.datastream_id +JOIN public.sms_device_mount_action dma ON dma.id = dsl.device_mount_action_id +JOIN public.sms_device d ON d.id = dma.device_id +JOIN public.sms_configuration c ON c.id = dma.configuration_id +WHERE c.is_public AND d.is_public AND dsl.datasource_id = %(tsm_schema)s +AND o.result_time BETWEEN dsl.begin_date AND COALESCE(dsl.end_date, 'infinity'::timestamp); + +COMMIT; diff --git a/src/timeio/CreateThingInDatabaseAction/sta/observed_property.sql b/src/timeio/CreateThingInDatabaseAction/sta/observed_property.sql new file mode 100644 index 00000000..4f7b06a2 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/observed_property.sql @@ -0,0 +1,28 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "OBS_PROPERTIES" CASCADE; +CREATE OR REPLACE VIEW "OBS_PROPERTIES" AS +SELECT DISTINCT + mq.id as "ID", + mq.term as "NAME", + mq.provenance_uri "DEFINITION", + mq.definition as "DESCRIPTION", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.id', %(cv_url)s || 'api/v1/measuredquantities/' || mq.id, + 'jsonld.type', 'ObservedPropertyProperties' + ) as "PROPERTIES" +FROM public.sms_cv_measured_quantity mq +JOIN public.sms_device_property dp ON mq.id = reverse(split_part(reverse(dp.property_uri), '/', 2))::int +JOIN public.sms_device_mount_action dma ON dp.device_id = dma.device_id +JOIN public.sms_configuration c ON dma.configuration_id = c.id +JOIN public.sms_device d ON dma.device_id = d.id +JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id +LEFT JOIN public.sms_configuration_static_location_begin_action csl ON dma.configuration_id = csl.configuration_id +LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl ON dma.configuration_id = cdl.configuration_id +WHERE (cdl.configuration_id IS NOT NULL OR csl.configuration_id IS NOT NULL) + AND c.is_public AND d.is_public AND dp.property_uri <> '' AND dsl.datasource_id = %(tsm_schema)s; + +COMMIT; \ No newline at end of file diff --git a/src/timeio/CreateThingInDatabaseAction/sta/schema_context.sql b/src/timeio/CreateThingInDatabaseAction/sta/schema_context.sql new file mode 100644 index 00000000..d51e4d58 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/schema_context.sql @@ -0,0 +1,20 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +CREATE OR REPLACE function public.get_schema_org_context() RETURNS jsonb AS +$$ +BEGIN +RETURN +'{ + "@version": "1.1", + "@import": "stamplate.jsonld", + "@vocab": "http://schema.org/" 
+}'::jsonb; +END; +$$ +language plpgsql; + +COMMIT; + + diff --git a/src/timeio/CreateThingInDatabaseAction/sta/sensor.sql b/src/timeio/CreateThingInDatabaseAction/sta/sensor.sql new file mode 100644 index 00000000..5a1c1295 --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/sensor.sql @@ -0,0 +1,84 @@ +BEGIN; + +SET search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "SENSORS" CASCADE; +CREATE OR REPLACE VIEW "SENSORS" AS WITH +device_role_responsible_persons AS ( + SELECT DISTINCT + d.id AS "device_id", + dcr.role_name AS "role_name", + dcr.role_uri AS "role_uri", + array_agg(DISTINCT jsonb_build_object( + 'jsonld.id', %(sms_url)s || 'contacts/' || co.id, + 'jsonld.type', 'Person', + 'givenName', co.given_name, + 'familyName', co.family_name, + 'email', co.email, + 'affiliation', jsonb_build_object( + 'jsonld.type', 'Organization', + 'name', co.organization, + 'identifier', NULL + ), + 'identifier', co.orcid + )) AS "responsible_persons" +FROM public.sms_device d +JOIN public.sms_device_contact_role dcr ON dcr.device_id = d.id +JOIN public.sms_contact co ON dcr.contact_id = co.id +GROUP BY d.id, dcr.role_name, dcr.role_uri +), +device_properties AS ( + SELECT DISTINCT + d.id AS "device_id", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.id', %(sms_url)s || 'devices/' || d.id, + 'jsonld.type', 'SensorProperties', + 'identifier', d.persistent_identifier, + 'isVariantOf', jsonb_build_object( + 'jsonld.type', 'ProductGroup', + 'name', d.device_type_name, + 'definition', d.device_type_uri + ), + 'isVirtual', false, + 'model', d.model, + 'manufacturer', jsonb_build_object( + 'jsonld.type', 'Organization', + 'name', d.manufacturer_name, + 'definition', d.manufacturer_uri + ), + 'serialNumber', d.serial_number, + 'responsiblePersons', array_agg(DISTINCT jsonb_build_object( + 'jsonld.type', 'Person', + 'roleName', drrp.role_name, + 'definition', drrp.role_uri, + 'resonsiblePersons', drrp.responsible_persons + )) + ) AS "properties" + FROM public.sms_device d + JOIN device_role_responsible_persons drrp ON d.id = drrp.device_id + JOIN public.sms_device_mount_action dma ON d.id = dma.device_id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + JOIN public.sms_configuration c ON dma.configuration_id = c.id + LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl ON c.id = cdl.configuration_id + LEFT JOIN public.sms_configuration_static_location_begin_action csl ON c.id = csl.configuration_id + GROUP BY d.id, d.persistent_identifier, d.device_type_name, d.device_type_uri, d.model, + d.manufacturer_name, d.manufacturer_uri, d.serial_number, c.is_public, d.is_public, + cdl.configuration_id, csl.configuration_id, dsl.datasource_id + HAVING ((cdl.configuration_id IS NOT NULL) OR (csl.configuration_id IS NOT NULL)) + AND d.is_public + AND c.is_public + AND dsl.datasource_id = %(tsm_schema)s +) +SELECT + d.id AS "ID", + d.short_name AS "NAME", + d.description AS "DESCRIPTION", + 'html' AS "ENCODING_TYPE", + %(sms_url)s || 'backend/api/v1/devices/' || d.id || '/sensorml' AS "METADATA", + dp.properties AS "PROPERTIES" +FROM public.sms_device d +JOIN device_properties dp ON d.id = dp.device_id +ORDER BY d.id ASC; + +COMMIT; \ No newline at end of file diff --git a/src/timeio/CreateThingInDatabaseAction/sta/thing.sql b/src/timeio/CreateThingInDatabaseAction/sta/thing.sql new file mode 100644 index 00000000..90d5fb2f --- /dev/null +++ b/src/timeio/CreateThingInDatabaseAction/sta/thing.sql @@ -0,0 +1,84 @@ +BEGIN; + +SET 
search_path TO %(tsm_schema)s; + +DROP VIEW IF EXISTS "THINGS" CASCADE; +CREATE OR REPLACE VIEW "THINGS" AS +WITH + configuration_role_responsible_persons AS ( + SELECT + c.id AS "configuration_id", + ccr.role_name AS "role_name", + ccr.role_uri AS "role_uri", + array_agg(DISTINCT jsonb_build_object( + 'jsonld.id', %(sms_url)s || 'contacts/' || co.id, + 'jsonld.type', 'Person', + 'givenName', co.given_name, + 'familyName', co.family_name, + 'email', co.email, + 'affiliation', jsonb_build_object( + 'jsonld.type', 'Organization', + 'name', co.organization, + 'identifier', NULL + ), + 'identifier', co.orcid + ) + ) AS "responsible_persons" + FROM public.sms_configuration c + JOIN public.sms_configuration_contact_role ccr ON c.id = ccr.configuration_id + JOIN public.sms_contact co ON ccr.contact_id = co.id + GROUP BY c.id, ccr.role_name, ccr.role_uri + ), + configuration_properties AS ( + SELECT + c.id AS "configuration_id", + jsonb_build_object( + '@context', public.get_schema_org_context(), + 'jsonld.id', %(sms_url)s || 'configurations/' || c.id, + 'jsonld.type', 'ThingProperties', + 'identifier', c.persistent_identifier, + 'responsiblePersons', array_agg(DISTINCT jsonb_build_object( + 'jsonld.type', 'Role', + 'roleName', crrp.role_name, + 'definition', crrp.role_uri, + 'resonsiblePersons', crrp.responsible_persons + )), + 'partOfProjects', array_agg(DISTINCT jsonb_build_object( + 'jsonld.type', 'Project', + 'name', c.project + )), + 'metadata', jsonb_build_object('jsonld.type', 'Dataset', + 'encodingType', 'http://www.opengis.net/doc/IS/SensorML/2.0', + 'distribution', jsonb_build_object( + 'jsonld.type', 'DataDistributionService', + 'url', %(sms_url)s || 'cbackend/api/v1/configurations/' || c.id || '/sensorml' + ) + ), + 'isVirtual', false + ) AS "properties" + FROM public.sms_configuration c + JOIN public.sms_configuration_contact_role ccr ON c.id = ccr.configuration_id + JOIN public.sms_contact co ON ccr.contact_id = co.id + JOIN public.sms_device_mount_action dma ON c.id = dma.configuration_id + JOIN public.sms_device d ON dma.device_id = d.id + JOIN public.sms_datastream_link dsl ON dma.id = dsl.device_mount_action_id + JOIN configuration_role_responsible_persons crrp ON c.id = crrp.configuration_id + LEFT JOIN public.sms_configuration_dynamic_location_begin_action cdl ON c.id = cdl.configuration_id + LEFT JOIN public.sms_configuration_static_location_begin_action csl ON c.id = csl.configuration_id + GROUP BY + c.id, c.persistent_identifier, c.project, cdl.configuration_id, csl.configuration_id, + c.is_public, d.is_public, dsl.datasource_id + HAVING + ((cdl.configuration_id IS NOT NULL) OR (csl.configuration_id IS NOT NULL)) + AND c.is_public AND d.is_public AND dsl.datasource_id = %(tsm_schema)s + ) +SELECT DISTINCT + c.id AS "ID", + c.description AS "DESCRIPTION", + c.label AS "NAME", + cp.properties AS "PROPERTIES" +FROM public.sms_configuration c +JOIN configuration_properties cp ON c.id = cp.configuration_id +ORDER BY c.id ASC; + +COMMIT; \ No newline at end of file diff --git a/src/timeio/__init__.py b/src/timeio/__init__.py new file mode 100644 index 00000000..e5a0d9b4 --- /dev/null +++ b/src/timeio/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/src/timeio/base_handler.py b/src/timeio/base_handler.py new file mode 100644 index 00000000..6d304dfa --- /dev/null +++ b/src/timeio/base_handler.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import json +import logging +import sys +import traceback +import typing +from abc import ABC, abstractmethod + 
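+# Third-party MQTT client (paho-mqtt). AbstractHandler below registers its
+# on_connect/on_subscribe/on_message/on_log methods as paho callbacks.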
+import paho.mqtt.client as mqtt +from paho.mqtt.client import MQTTMessage + +from utils.errors import ( + UserInputError, + DataNotFoundError, + ParsingError, + NoDataWarning, + ProcessingError, +) + +logger = logging.getLogger("mqtt-handler") + + +class AbstractHandler(ABC): + def __init__( + self, + topic: str, + mqtt_broker: str, + mqtt_user: str, + mqtt_password: str, + mqtt_client_id: str, + mqtt_qos: int, + mqtt_clean_session: bool, + ): + self.topic = topic + self.mqtt_broker = mqtt_broker + self.mqtt_user = mqtt_user + self.mqtt_password = mqtt_password + self.mqtt_client_id = mqtt_client_id + self.mqtt_qos = mqtt_qos + self.mqtt_clean_session = mqtt_clean_session + self.mqtt_host = mqtt_broker.split(":")[0] + self.mqtt_port = int(mqtt_broker.split(":")[1]) + self.mqtt_client = mqtt.Client( + client_id=mqtt_client_id, + clean_session=self.mqtt_clean_session, + ) + self.mqtt_client.suppress_exceptions = False + self.mqtt_client.username_pw_set(self.mqtt_user, self.mqtt_password) + self.mqtt_client.on_connect = self.on_connect + self.mqtt_client.on_subscribe = self.on_subscribe + self.mqtt_client.on_message = self.on_message + self.mqtt_client.on_log = self.on_log + + def run_loop(self) -> typing.NoReturn: + logger.info(f"Setup ok, starting listening loop") + self.mqtt_client.connect(self.mqtt_host, self.mqtt_port) + self.mqtt_client.subscribe(self.topic, self.mqtt_qos) + self.mqtt_client.loop_forever() + + def on_log(self, client: mqtt.Client, userdata, level, buf): + logger.debug(f"%s: %s", level, buf) + + def on_connect(self, client: mqtt.Client, userdata, flags, rc): + if rc == 0: + logger.info( + f"Connected to %r with client ID: %s", + self.mqtt_broker, + self.mqtt_client._client_id.decode(), + ) + return + logger.error(f"Failed to connect to %r, return code: %s", self.mqtt_broker, rc) + + def on_subscribe(self, client: mqtt.Client, userdata, mid, granted_qos): + logger.info(f"Subscribed to topic {self.topic} with QoS {granted_qos[0]}") + + def on_message(self, client: mqtt.Client, userdata, message: MQTTMessage): + logger.info( + "\n\n======================= NEW MESSAGE ========================\n" + f"Topic: %r, QoS: %s, Timestamp: %s", + message.topic, + message.qos, + message.timestamp, + ) + + try: + content = self._decode(message) + except Exception: + logger.critical( + f"\n====================== CRITICAL ERROR ======================\n" + f"Status: PARSING ERROR (Decoding/parsing of payload failed)\n" + f"Payload:\n{message.payload!r}\n" + f"{traceback.format_exc()}" + f"========================= SYS EXIT =========================\n", + ) + # We exit now, because otherwise the client.on_log would print + # the exception again (with unnecessary clutter) + sys.exit(1) + + try: + logger.debug(f"calling %s.act()", self.__class__.__qualname__) + self.act(content, message) + + except (UserInputError, ParsingError): + logger.error( + f"\n======================== USER ERROR ========================\n" + f"Status: ERROR (An error because of user data or input)\n" + f"Content:\n{content!r}\n" + f"{traceback.format_exc()}" + f"======================== USER ERROR ========================\n", + ) + return + except (DataNotFoundError, NoDataWarning): + logger.error( + f"\n======================== DATA ERROR ========================\n" + f"Status: ERROR (Data is missing)\n" + f"Content:\n{content!r}\n" + f"{traceback.format_exc()}" + f"======================== DATA ERROR ========================\n", + ) + return + + except Exception: + logger.critical( + f"\n====================== 
CRITICAL ERROR ======================\n"
+                f"Status: UNHANDLED ERROR (See exception and traceback below)\n"
+                f"Content:\n{content!r}\n"
+                f"{traceback.format_exc()}"
+                f"========================= SYS EXIT =========================\n",
+            )
+            # We exit now, because otherwise the client.on_log would print
+            # the exception again (with unnecessary clutter)
+            sys.exit(1)
+
+        logger.info(
+            f"\n===================== PROCESSING DONE ======================\n"
+            f"Status: Success (Message was processed successfully)\n"
+            f"===================== PROCESSING DONE ======================\n",
+        )
+
+    def _decode(self, message: MQTTMessage) -> typing.Any:
+        """
+        This decodes the message from utf-8 and also tries to decode JSON into
+        Python objects. If the payload is not valid JSON (e.g. a plain string
+        or a datetime-like value such as 2022-22-22T11:11:11), the decoded
+        string itself is returned instead.
+
+        Parameters
+        ----------
+        message : MQTTMessage
+            Message to decode.
+
+        Returns
+        -------
+        content:
+            The decoded content
+
+        Raises
+        ------
+        UnicodeDecodeError
+            If the raw message is not 'utf-8' encoded.
+        """
+        # Hint: json.loads also decodes single numeric values,
+        # the constants `null`, +/-`Infinity` and `NaN`.
+        decoded: str = message.payload.decode("utf-8")
+        try:
+            decoded = json.loads(decoded)
+        except json.JSONDecodeError:
+            logger.warning(
+                "Message content is not valid json. (That's ok, but unusual)"
+            )
+        return decoded
+
+    @abstractmethod
+    def act(self, content: typing.Any, message: MQTTMessage):
+        """
+        Subclasses must override this method.
+
+        The calling function will handle the following exceptions:
+        - utils.errors.ParsingError
+        - utils.errors.UserInputError
+        - utils.errors.DataNotFoundError
+        - utils.errors.NoDataWarning
+
+        Other exceptions will lead to a system exit.
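+
+        Example of a minimal implementation (illustrative sketch only,
+        modelled on the concrete handlers in this package):
+
+            class MyHandler(AbstractHandler):
+                def act(self, content, message):
+                    thing = Thing.get_instance(content)
+                    ...  # do the actual work for this thing/message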
+ """ + + raise NotImplementedError diff --git a/src/timeio/check.py b/src/timeio/check.py new file mode 100644 index 00000000..83298dc8 --- /dev/null +++ b/src/timeio/check.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# test if we can import everything without +# errors, but we ignore warnings +import warnings + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from base_handler import * # noqa + from crontab_setup import * # noqa + from databases import * # noqa + from db_setup import * # noqa + from file_ingest import * # noqa + from frost import * # noqa + from frost_setup import * # noqa + from grafana_dashboard_setup import * # noqa + from grafana_user_setup import * # noqa + from minio_setup import * # noqa + from mqtt_ingest import * # noqa + from mqtt_user_setup import * # noqa + from parsing import * # noqa + from thing import * # noqa + from version import __version__ + +if __name__ == "__main__": + print(__version__) diff --git a/src/timeio/crontab_setup.py b/src/timeio/crontab_setup.py new file mode 100644 index 00000000..4cf8fe80 --- /dev/null +++ b/src/timeio/crontab_setup.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import logging +from datetime import datetime +from random import randint + +from crontab import CronItem, CronTab + +from base_handler import AbstractHandler, MQTTMessage +from thing import Thing +from utils import get_envvar, setup_logging +from utils.journaling import Journal + +logger = logging.getLogger("crontab-setup") +journal = Journal("Cron") + + +class CreateThingInCrontabHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.tabfile = "/tmp/cron/crontab.txt" + + def act(self, content: dict, message: MQTTMessage): + thing = Thing.get_instance(content) + with CronTab(tabfile=self.tabfile) as crontab: + for job in crontab: + if self.job_belongs_to_thing(job, thing): + logger.info(f"Updating cronjob for thing {thing.name}") + self.update_job(job, thing) + journal.info(f"Updated cronjob", thing.uuid) + return + # if no job was found, create a new one + job = crontab.new() + logger.info(f"Creating job for thing {thing.name}") + info = self.make_job(job, thing) + if not info: + logger.warning( + "no Cronjob was created, because neither extAPI, " + "nor extSFTP is present" + ) + return + crontab.append(job) + journal.info(f"Created cronjob to sync {info}", thing.uuid) + + @classmethod + def make_job(cls, job: CronItem, thing: Thing) -> str: + info = "" + comment = cls.mk_comment(thing) + uuid = thing.uuid + if thing.external_sftp is not None: + interval = int(thing.external_sftp.sync_interval) + schedule = cls.get_schedule(interval) + script = "/scripts/sftp_sync/sftp_sync.py" + keyfile = thing.external_sftp.private_key_path + command = f"{script} {uuid} {keyfile} > $STDOUT 2> $STDERR" + job.enable(enabled=thing.external_sftp.enabled) + job.set_comment(comment, pre_comment=True) + job.setall(schedule) + job.set_command(command) + info = f"sFTP {thing.external_sftp.uri} @ {interval}s" + if thing.external_api is not None: + interval = int(thing.external_api.sync_interval) + schedule = cls.get_schedule(interval) + script = f"/scripts/ext_api_sync/{thing.external_api.api_type}_api_sync.py" + 
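+            # The external-API sync script is called with the thing UUID, the
+            # API settings and the target database URL (see `command` below).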
target_uri = thing.database.url + command = f"""{script} {uuid} "{thing.external_api.settings}" {target_uri} > $STDOUT 2> $STDERR""" + job.enable(enabled=thing.external_api.enabled) + job.set_comment(comment, pre_comment=True) + job.setall(schedule) + job.set_command(command) + info = f"{thing.external_api.api_type}-API @ {interval}s" + return info + + # alias + update_job = make_job + + @staticmethod + def job_belongs_to_thing(job: CronItem, thing: Thing) -> bool: + """Check if job belongs to thing.""" + return job.comment.split(" | ")[-1] == thing.uuid + + @staticmethod + def mk_comment(thing: Thing) -> str: + now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return f"{now_str} | {thing.project.name} | {thing.name} | {thing.uuid}" + + @staticmethod + def get_schedule(interval: int) -> str: + # set a random delay to avoid all jobs running at the same time + # maximum delay is the interval or 59, whichever is smaller + delay_m = randint(0, min(interval - 1, 59)) + # interval is smaller than an hour + if interval < 60: + return f"{delay_m}-59/{interval} * * * *" + # interval is smaller than a day + elif interval < 1440: + delay_h = randint(0, min(interval // 60 - 1, 23)) + return f"{delay_m} {delay_h}-23/{interval//60} * * *" + else: + delay_h = randint(0, min(interval // 60 - 1, 23)) + delay_wd = randint(0, min(interval // 1440 - 1, 6)) + return f"{delay_m} {delay_h} * * {delay_wd}-6/{interval//1440}" + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateThingInCrontabHandler().run_loop() diff --git a/src/timeio/databases.py b/src/timeio/databases.py new file mode 100644 index 00000000..bcbcd634 --- /dev/null +++ b/src/timeio/databases.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import logging +import threading +import urllib.request +from functools import partial +from typing import Any, Callable, Literal + +import psycopg +import psycopg2 +import psycopg2.extensions +import requests +from psycopg import Connection, conninfo +from psycopg.rows import dict_row + +import parsing +from utils.errors import DataNotFoundError + + +class Database: + name = "database" + + def __init__(self, dsn: str): + self.info = conninfo.conninfo_to_dict(dsn) + self.info.pop("password") + self.__dsn = dsn + self.ping() + + @property + def connection(self) -> Callable[[], psycopg.Connection]: + return partial(psycopg.connect, self.__dsn) + + def ping(self, conn: Connection | None = None): + try: + if conn is not None: + conn.execute("") + else: + with self.connection() as conn: + conn.execute("") + except psycopg.errors.DatabaseError as e: + raise ConnectionError(f"Ping to {self.name} failed. 
({self.info})") from e + + +class ConfigDB(Database): + name = "configDB" + + def get_parser(self, thing_uuid) -> parsing.FileParser: + """Returns parser-type-name and parser-parameter""" + query = ( + "select fpt.name, fp.params from thing t " + "join s3_store s3 on t.s3_store_id = s3.id " + "join file_parser fp on s3.file_parser_id = fp.id " + "join file_parser_type fpt on fp.file_parser_type_id = fpt.id " + "where t.uuid = %s" + ) + with self.connection() as conn: + p_type, p_params = conn.execute(query, [thing_uuid]).fetchone() # noqa + return parsing.get_parser(p_type, p_params) + + def get_mqtt_parser(self, thing_uuid) -> parsing.MqttDataParser: + query = ( + "select mdt.name from thing t join mqtt m on t.mqtt_id = m.id " + "join mqtt_device_type mdt on m.mqtt_device_type_id = mdt.id " + "where t.uuid = %s" + ) + with self.connection() as conn: + dev_type = conn.execute(query, [thing_uuid]).fetchone() # noqa + + return parsing.get_parser(dev_type, None) + + def get_thing_uuid(self, by: Literal["bucket", "mqtt_user"], value) -> str | None: + # fmt: off + by_map = { + "bucket": "select t.uuid from thing t join s3_store s3 on " + "t.s3_store_id = s3.id where s3.bucket = %s", + "mqtt_user": 'select t.uuid from mqtt m join thing t on ' + 'm.id = t.mqtt_id where m."user" = %s', + } + # fmt: on + logging.debug(f"get thing uuid for {by}={value}") + if query := by_map.get(by): + with self.connection() as conn: + res = conn.execute(query, [value]).fetchone() + if res is None: + raise DataNotFoundError(f"No thing for {by}: {value}") + uuid = res[0] + logging.debug(f"got thing {uuid}") + return uuid + raise ValueError("Argument 'by' must be one of 'bucket' or 'mqtt_user'") + + def get_s3_store(self, thing_uuid): + query = ( + "select s3s.* from config_db.s3_store s3s join " + "thing t on s3s.id = t.s3_store_id where t.uuid = %s" + ) + with self.connection() as conn: + with conn.cursor(row_factory=dict_row) as cur: + return cur.execute(query, [thing_uuid]).fetchone() + + +class DBapi: + + def __init__(self, base_url): + self.base_url = base_url + self.ping_dbapi() + + def ping_dbapi(self): + """ + Test the health endpoint of the given url. + + Added in version 0.4.0 + """ + with urllib.request.urlopen(f"{self.base_url}/health") as resp: + if not resp.status == 200: + raise ConnectionError( + f"Failed to ping. HTTP status code: {resp.status}" + ) + + def upsert_observations(self, thing_uuid: str, observations: list[dict[str, Any]]): + url = f"{self.base_url}/observations/upsert/{thing_uuid}" + response = requests.post(url, json={"observations": observations}) + if response.status_code not in (200, 201): + raise RuntimeError( + f"upload to {thing_uuid} failed with " + f"{response.reason} and {response.text}" + ) + + +class ReentrantConnection: + """ + Workaround for stale connections. + Stale connections might happen for different reasons, for example, when + a timeout occur, because the connection was not used for some time or + the database service restarted. 
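+    Usage (summary of the methods below): connect()/reconnect() return a live
+    psycopg2 connection; reconnect() first pings the server with a short
+    client-side timeout and transparently re-establishes the connection when
+    it has gone stale, while close() closes the underlying connection.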
+ """ + + # in seconds + TIMEOUT = 2.0 + logger = logging.getLogger("ReentrantConnection") + + def __init__( + self, dsn=None, connection_factory=None, cursor_factory=None, **kwargs + ): + + # we use a nested function to hide credentials + def _connect(_self) -> None: + _self._conn = psycopg2.connect( + dsn, connection_factory, cursor_factory, **kwargs + ) + + self._conn: psycopg2.extensions.connection | None = None + self._connect = _connect + self._lock = threading.RLock() + + def _is_alive(self) -> bool: + try: + self._ping() + except TimeoutError: + self.logger.debug("Connection timed out") + return False + except (psycopg2.InterfaceError, psycopg2.OperationalError): + self.logger.debug("Connection seems stale") + return False + else: + return True + + def _ping(self): + if self._conn is None: + raise ValueError("must call connect first") + with self._conn as conn: + # unfortunately there is no client side timeout + # option, and we encountered spurious very long + # Connection timeouts (>15 min) + timer = threading.Timer(self.TIMEOUT, conn.cancel) + timer.start() + try: + # also unfortunately there is no other way to check + # if the db connection is still alive, other than to + # send a (simple) query. + with conn.cursor() as c: + c.execute("select 1") + c.fetchone() + if timer.is_alive(): + return + finally: + try: + timer.cancel() + except Exception: + pass + raise TimeoutError("Connection timed out") + + def reconnect(self) -> psycopg2.extensions.connection: + with self._lock: + if self._conn is None or not self._is_alive(): + try: + self._conn.close() # noqa + except Exception: + pass + self.logger.debug("(re)connecting to database") + self._connect(self) + self._ping() + return self._conn + + connect = reconnect + + def close(self) -> None: + self._conn.close() diff --git a/src/timeio/db_setup.py b/src/timeio/db_setup.py new file mode 100644 index 00000000..02868ed4 --- /dev/null +++ b/src/timeio/db_setup.py @@ -0,0 +1,396 @@ +from __future__ import annotations + +import json +import logging +import os + +from psycopg2 import sql + +from base_handler import AbstractHandler, MQTTMessage +from databases import ReentrantConnection +from thing import Thing +from utils import get_envvar, setup_logging +from utils.journaling import Journal +from utils.crypto import decrypt, get_crypt_key +import psycopg + +logger = logging.getLogger("db-setup") +journal = Journal("System") + + +class CreateThingInPostgresHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.db_conn = ReentrantConnection(get_envvar("DATABASE_URL")) + self.db = self.db_conn.connect() + + def act(self, content: dict, message: MQTTMessage): + self.db = self.db_conn.reconnect() + thing = Thing.get_instance(content) + logger.info(f"start processing. {thing.name=}, {thing.uuid=}") + STA_PREFIX = "sta_" + GRF_PREFIX = "grf_" + + # 1. 
Check, if there is already a database user for this project + if not self.user_exists(user := thing.database.username.lower()): + logger.debug(f"create user {user}") + self.create_user(thing) + logger.debug("create schema") + self.create_schema(thing) + + if not self.user_exists( + sta_user := STA_PREFIX + thing.database.ro_username.lower() + ): + logger.debug(f"create sta read-only user {sta_user}") + self.create_ro_user(thing, user_prefix=STA_PREFIX) + + if not self.user_exists( + grf_user := GRF_PREFIX + thing.database.ro_username.lower() + ): + logger.debug(f"create grafana read-only user {grf_user}") + self.create_ro_user(thing, user_prefix=GRF_PREFIX) + + logger.debug("deploy dll") + self.deploy_ddl(thing) + logger.debug("deploy dml") + self.deploy_dml() + + logger.info("update/create thing in db") + created = self.upsert_thing(thing) + journal.info(f"{'Created' if created else 'Updated'} Thing", thing.uuid) + + logger.debug("create frost views") + self.create_frost_views(thing, user_prefix=STA_PREFIX) + logger.debug(f"grand frost view privileges to {sta_user}") + self.grant_sta_select(thing, user_prefix=STA_PREFIX) + logger.debug("create grafana helper views") + self.create_grafana_helper_view(thing) + logger.debug(f"grand grafana view privileges to {grf_user}") + self.grant_grafana_select(thing, user_prefix=GRF_PREFIX) + + def create_user(self, thing): + + with self.db: + with self.db.cursor() as c: + user = sql.Identifier(thing.database.username.lower()) + passw = decrypt(thing.database.password, get_crypt_key()) + c.execute( + sql.SQL("CREATE ROLE {user} WITH LOGIN PASSWORD {password}").format( + user=user, password=sql.Literal(passw) + ) + ) + c.execute( + sql.SQL("GRANT {user} TO {creator}").format( + user=user, creator=sql.Identifier(self.db.info.user) + ) + ) + + def create_ro_user(self, thing, user_prefix: str = ""): + with self.db: + with self.db.cursor() as c: + ro_username = user_prefix.lower() + thing.database.ro_username.lower() + ro_user = sql.Identifier(ro_username) + schema = sql.Identifier(thing.database.username.lower()) + ro_passw = decrypt(thing.database.ro_password, get_crypt_key()) + + c.execute( + sql.SQL( + "CREATE ROLE {ro_user} WITH LOGIN PASSWORD {ro_password}" + ).format(ro_user=ro_user, ro_password=sql.Literal(ro_passw)) + ) + + c.execute( + sql.SQL("GRANT {ro_user} TO {creator}").format( + ro_user=ro_user, creator=sql.Identifier(self.db.info.user) + ) + ) + + # Allow tcp connections to database with new user + c.execute( + sql.SQL("GRANT CONNECT ON DATABASE {db_name} TO {ro_user}").format( + ro_user=ro_user, db_name=sql.Identifier(self.db.info.dbname) + ) + ) + + c.execute( + sql.SQL("GRANT USAGE ON SCHEMA {schema} TO {ro_user}").format( + ro_user=ro_user, schema=schema + ) + ) + + def password_has_changed(self, url, user, password): + # NOTE: currently unused function + try: + with psycopg.connect(url, user=user, password=password): + pass + except psycopg.OperationalError as e: + if "password authentication failed" in str(e): + return True + raise e + else: + return False + + def maybe_update_password(self, user, password, db_url): + # NOTE: currently unused function + password = decrypt(password, get_crypt_key()) + if not self.password_has_changed(user, password, db_url): + return + + logger.debug(f"update password for user {user}") + with self.db: + with self.db.cursor() as c: + c.execute( + sql.SQL("ALTER USER {user} WITH PASSWORD {password}").format( + user=sql.Identifier(user), password=sql.Identifier(password) + ) + ) + + def create_schema(self, 
thing): + with self.db: + with self.db.cursor() as c: + c.execute( + sql.SQL( + "CREATE SCHEMA IF NOT EXISTS {user} AUTHORIZATION {user}" + ).format(user=sql.Identifier(thing.database.username.lower())) + ) + + def deploy_ddl(self, thing): + file = os.path.join( + os.path.dirname(__file__), + "CreateThingInDatabaseAction", + "postgres-ddl.sql", + ) + with open(file) as fh: + query = fh.read() + + with self.db: + with self.db.cursor() as c: + user = sql.Identifier(thing.database.username.lower()) + # Set search path for current session + c.execute(sql.SQL("SET search_path TO {0}").format(user)) + # Allow tcp connections to database with new user + c.execute( + sql.SQL("GRANT CONNECT ON DATABASE {db_name} TO {user}").format( + user=user, db_name=sql.Identifier(self.db.info.dbname) + ) + ) + # Set default schema when connecting as user + c.execute( + sql.SQL( + "ALTER ROLE {user} SET search_path to {user}, public" + ).format(user=user) + ) + # Grant schema to new user + c.execute( + sql.SQL("GRANT USAGE ON SCHEMA {user}, public TO {user}").format( + user=user + ) + ) + # Equip new user with all grants + c.execute( + sql.SQL("GRANT ALL ON SCHEMA {user} TO {user}").format(user=user) + ) + # deploy the tables and indices and so on + c.execute(query) + + c.execute( + sql.SQL( + "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA {user} TO {user}" + ).format(user=user) + ) + + c.execute( + sql.SQL( + "GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA {user} TO {user}" + ).format(user=user) + ) + + def deploy_dml(self): + file = os.path.join( + os.path.dirname(__file__), + "CreateThingInDatabaseAction", + "postgres-dml.sql", + ) + with open(file) as fh: + query = fh.read() + with self.db: + with self.db.cursor() as c: + c.execute(query) + + def grant_sta_select(self, thing, user_prefix: str): + schema = sql.Identifier(thing.database.username.lower()) + sta_user = sql.Identifier( + user_prefix.lower() + thing.database.ro_username.lower() + ) + with self.db: + with self.db.cursor() as c: + # Set default schema when connecting as user + c.execute( + sql.SQL( + "ALTER ROLE {sta_user} SET search_path to {schema}, public" + ).format(sta_user=sta_user, schema=schema) + ) + + # grant read rights to newly created views in schema to user + c.execute( + sql.SQL( + "GRANT SELECT ON ALL TABLES in SCHEMA {schema} TO {sta_user}" + ).format(sta_user=sta_user, schema=schema) + ) + + c.execute( + sql.SQL( + "GRANT SELECT ON ALL SEQUENCES in SCHEMA {schema} TO {sta_user}" + ).format(sta_user=sta_user, schema=schema) + ) + + c.execute( + sql.SQL( + "GRANT EXECUTE ON ALL FUNCTIONS in SCHEMA {schema} TO {sta_user}" + ).format(sta_user=sta_user, schema=schema) + ) + + def grant_grafana_select(self, thing, user_prefix: str): + with self.db: + with self.db.cursor() as c: + schema = sql.Identifier(thing.database.username.lower()) + grf_user = sql.Identifier( + user_prefix.lower() + thing.database.ro_username.lower() + ) + + # Set default schema when connecting as user + c.execute( + sql.SQL("ALTER ROLE {grf_user} SET search_path to {schema}").format( + grf_user=grf_user, schema=schema + ) + ) + + c.execute(sql.SQL("SET search_path TO {schema}").format(schema=schema)) + + c.execute( + sql.SQL( + "REVOKE ALL ON ALL TABLES IN SCHEMA {schema}, public FROM {grf_user}" + ).format(grf_user=grf_user, schema=schema) + ) + + c.execute( + sql.SQL( + "GRANT SELECT ON TABLE thing, datastream, observation, " + "journal, datastream_properties TO {grf_user}" + ).format(grf_user=grf_user, schema=schema) + ) + + def create_frost_views(self, thing, 
user_prefix: str = "sta_"): + base_path = os.path.join( + os.path.dirname(__file__), "CreateThingInDatabaseAction", "sta" + ) + files = [ + os.path.join(base_path, "schema_context.sql"), + os.path.join(base_path, "thing.sql"), + os.path.join(base_path, "location.sql"), + os.path.join(base_path, "sensor.sql"), + os.path.join(base_path, "observed_property.sql"), + os.path.join(base_path, "datastream.sql"), + os.path.join(base_path, "observation.sql"), + os.path.join(base_path, "feature.sql"), + ] + with self.db: + with self.db.cursor() as c: + for file in files: + logger.debug(f"deploy file: {file}") + with open(file) as fh: + view = fh.read() + user = sql.Identifier(thing.database.username.lower()) + sta_user = sql.Identifier( + user_prefix.lower() + thing.database.ro_username.lower() + ) + c.execute(sql.SQL("SET search_path TO {user}").format(user=user)) + c.execute( + view, + { + "cv_url": os.environ.get("CV_URL"), + "sms_url": os.environ.get("SMS_URL"), + "tsm_schema": thing.database.username.lower(), + }, + ) + + def create_grafana_helper_view(self, thing): + with self.db: + with self.db.cursor() as c: + username_identifier = sql.Identifier(thing.database.username.lower()) + # Set search path for current session + c.execute(sql.SQL("SET search_path TO {0}").format(username_identifier)) + c.execute( + sql.SQL("""DROP VIEW IF EXISTS "datastream_properties" CASCADE""") + ) + c.execute( + sql.SQL( + """ + CREATE OR REPLACE VIEW "datastream_properties" AS + SELECT DISTINCT case + when dp.property_name is null or dp.unit_name is null then tsm_ds.position + else concat(dp.property_name, ' (', dp.unit_name, ') - ', tsm_ds."position"::text) + end as "property", + tsm_ds."position", + tsm_ds.id as "ds_id", + tsm_t.uuid as "t_uuid" + FROM datastream tsm_ds + JOIN thing tsm_t ON tsm_ds.thing_id = tsm_t.id + LEFT JOIN public.sms_datastream_link sdl ON tsm_t.uuid = sdl.thing_id AND tsm_ds.id = sdl.datastream_id + LEFT JOIN public.sms_device_property dp ON sdl.device_property_id = dp.id + """ + ) + ) + + def upsert_thing(self, thing) -> bool: + """Returns True for insert and False for update""" + query = ( + "INSERT INTO thing (name, uuid, description, properties) " + "VALUES (%s, %s, %s, %s) ON CONFLICT (uuid) DO UPDATE SET " + "name = EXCLUDED.name, " + "description = EXCLUDED.description, " + "properties = EXCLUDED.properties " + "RETURNING (xmax = 0)" + ) + with self.db: + with self.db.cursor() as c: + c.execute( + sql.SQL("SET search_path TO {user}").format( + user=sql.Identifier(thing.database.username.lower()) + ) + ) + c.execute( + query, + ( + thing.name, + thing.uuid, + thing.description, + json.dumps(thing.properties), + ), + ) + return c.fetchone()[0] + + def thing_exists(self, username: str): + with self.db: + with self.db.cursor() as c: + c.execute("SELECT 1 FROM pg_roles WHERE rolname=%s", [username]) + return len(c.fetchall()) > 0 + + def user_exists(self, username: str): + with self.db: + with self.db.cursor() as c: + c.execute("SELECT 1 FROM pg_roles WHERE rolname=%s", [username]) + return len(c.fetchall()) > 0 + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateThingInPostgresHandler().run_loop() diff --git a/src/timeio/file_ingest.py b/src/timeio/file_ingest.py new file mode 100644 index 00000000..de150cc7 --- /dev/null +++ b/src/timeio/file_ingest.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import fnmatch +import json +import logging +from datetime import datetime +import warnings + +from minio import Minio +from 
minio.commonconfig import Tags + +import databases +from base_handler import AbstractHandler, MQTTMessage +from utils import get_envvar, setup_logging +from utils.errors import UserInputError, ParsingError +from utils.journaling import Journal + +_FILE_MAX_SIZE = 256 * 1024 * 1024 + +logger = logging.getLogger("file-ingest") +journal = Journal("Parser") + + +class ParserJobHandler(AbstractHandler): + + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.minio = Minio( + endpoint=get_envvar("MINIO_URL"), + access_key=get_envvar("MINIO_ACCESS_KEY"), + secret_key=get_envvar("MINIO_SECURE_KEY"), + secure=get_envvar("MINIO_SECURE", default=True, cast_to=bool), + ) + self.pub_topic = get_envvar("TOPIC_DATA_PARSED") + self.dbapi = databases.DBapi(get_envvar("DB_API_BASE_URL")) + self.confdb = databases.ConfigDB(get_envvar("CONFIGDB_DSN")) + + def act(self, content: dict, message: MQTTMessage): + + if not self.is_valid_event(content): + logger.debug(f'irrelevant event {content["EventName"]!r}') + return + + # Directories are part of the filename + # eg: foo/bar/file.ext -> bucket: foo, file: bar/file.ext + bucket_name, filename = content["Key"].split("/", maxsplit=1) + thing_uuid = self.confdb.get_thing_uuid("bucket", bucket_name) + pattern = self.confdb.get_s3_store(thing_uuid)["filename_pattern"] + + if not fnmatch.fnmatch(filename, pattern): + logger.debug(f"{filename} is excluded by filename_pattern {pattern!r}") + return + + source_uri = f"{bucket_name}/{filename}" + + logger.debug(f"loading parser for {thing_uuid}") + parser = self.confdb.get_parser(thing_uuid) + + logger.debug(f"reading raw data file {source_uri}") + rawdata = self.read_file(bucket_name, filename) + + logger.info(f"parsing rawdata ... ") + file = source_uri + with warnings.catch_warnings() as w: + try: + df = parser.do_parse(rawdata) + obs = parser.to_observations(df, source_uri) + except ParsingError as e: + journal.error( + f"Parsing failed. Detail: {e}. File: {file!r}", thing_uuid + ) + raise e + except Exception as e: + journal.error(f"Parsing failed for file {file!r}", thing_uuid) + raise UserInputError("Parsing failed") from e + if w: + journal.warning(w[0].message, thing_uuid) + + logger.debug(f"storing observations to database ...") + try: + self.dbapi.upsert_observations(thing_uuid, obs) + except Exception as e: + # Tell the user that his parsing was successful + journal.error( + f"Parsing was successful, but storing data " + f"in database failed. 
File: {file!r}", + thing_uuid, + ) + raise e + + # Now everything is fine and we tell the user + journal.info(f"Parsed file {file}", thing_uuid) + + object_tags = Tags.new_object_tags() + object_tags["parsed_at"] = datetime.now().isoformat() + self.minio.set_object_tags(bucket_name, filename, object_tags) + payload = json.dumps({"thing_uuid": str(thing_uuid)}) + self.mqtt_client.publish( + topic=self.pub_topic, payload=payload, qos=self.mqtt_qos + ) + + def is_valid_event(self, content: dict): + logger.debug(f'{content["EventName"]=}') + return content["EventName"] in ( + "s3:ObjectCreated:Put", + "s3:ObjectCreated:CompleteMultipartUpload", + ) + + def read_file(self, bucket_name, object_name) -> str: + stat = self.minio.stat_object(bucket_name, object_name) + if stat.size > _FILE_MAX_SIZE: + raise IOError("Maximum filesize of 256M exceeded") + rawdata = ( + self.minio.get_object(bucket_name, object_name) + .read() + .decode() + # remove the ASCII control character ETX (end-of-text) + .rstrip("\x03") + ) + return rawdata + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + ParserJobHandler().run_loop() diff --git a/src/timeio/frost.py b/src/timeio/frost.py new file mode 100644 index 00000000..3907b530 --- /dev/null +++ b/src/timeio/frost.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import logging +import pathlib +import xml.etree.ElementTree as ET # noqa +from urllib.parse import urlparse, urlunparse +from utils.crypto import decrypt, get_crypt_key + +logger = logging.getLogger(__name__) + +_TEMPLATE = """\ +<?xml version="1.0" encoding="UTF-8"?> +<Context path="/{schema}" docBase="/share/FROST-Server.war"> + <Parameter override="false" name="ApiVersion" value="v1.0" description="The version tag of the API used in the URL."/> + <Parameter override="false" name="serviceRootUrl" value="{tomcat_proxy_url}{schema}" description="The base URL of the SensorThings Server without version."/> + <Parameter override="false" name="defaultCount" value="false" description="The default value for the $count query option."/> + <Parameter override="false" name="defaultTop" value="100" description="The default value for the $top query option."/> + <Parameter override="false" name="maxTop" value="1000" description="The maximum allowed value for the $top query option."/> + <Parameter override="false" name="maxDataSize" value="25000000" description="The maximum allowed estimated data size (in bytes) for responses."/> + <Parameter override="false" name="bus.busImplementationClass" value="de.fraunhofer.iosb.ilt.frostserver.messagebus.InternalMessageBus" description="The java class used for connecting to the message bus."/> + <Parameter override="false" name="persistence.persistenceManagerImplementationClass" value="de.fraunhofer.iosb.ilt.frostserver.persistence.pgjooq.imp.PostgresPersistenceManagerLong" description="The java class used for persistence (must implement PersistenceManaher interface)"/> + <Parameter override="false" name="persistence.idGenerationMode" value="ServerGeneratedOnly" description="Mode for id generation when using PostgresPersistenceManagerString."/> + <Parameter override="false" name="persistence.autoUpdateDatabase" value="false" description="Automatically apply database updates."/> + <Parameter override="false" name="persistence.alwaysOrderbyId" value="false" description="Always add an 'orderby=id asc' to queries to ensure consistent paging."/> + <Parameter override="false" name="persistence.db_jndi_datasource" 
value="jdbc/sensorThings" description="JNDI data source name"/> + <Resource + name="jdbc/sensorThings" auth="Container" + type="javax.sql.DataSource" driverClassName="org.postgresql.Driver" + url="jdbc:{db_url}" + username="{username}" password="{password}" + maxTotal="20" maxIdle="10" maxWaitMillis="-1" + defaultAutoCommit="false" + /> +</Context> +""" +CONTEXT_FILES_DIR = ( + pathlib.Path(__file__) + .resolve() + .parent.joinpath("CreateNewFrostInstanceAction/tomcat/context_files") +) + + +def write_context_file(schema, user, password, db_url, tomcat_proxy_url) -> None: + parts = urlparse(db_url) + hostname = parts.hostname + if parts.port: + hostname += f":{parts.port}" + jdbc_url = urlunparse((parts.scheme, hostname, parts.path, "", "", "")) + content = _TEMPLATE.format( + db_url=jdbc_url, + schema=schema, + username=user, + password=decrypt(password, get_crypt_key()), + tomcat_proxy_url=tomcat_proxy_url, + ).strip() + + path = f"{CONTEXT_FILES_DIR}/{schema}.xml" + logger.debug(f"write tomcat context file {path!r}") + with open(path, "wb") as fh: + fh.write(ET.tostring(ET.XML(content))) diff --git a/src/timeio/frost_setup.py b/src/timeio/frost_setup.py new file mode 100644 index 00000000..73eb310d --- /dev/null +++ b/src/timeio/frost_setup.py @@ -0,0 +1,31 @@ +import logging + +from base_handler import AbstractHandler, MQTTMessage +from thing import Thing +from utils import get_envvar, setup_logging + +logger = logging.getLogger("frost-setup") + + +class CreateFrostInstanceHandler(AbstractHandler): + + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.tomcat_proxy_url = get_envvar("TOMCAT_PROXY_URL") + + def act(self, content: dict, message: MQTTMessage): + thing = Thing.get_instance(content) + thing.setup_frost(self.tomcat_proxy_url) + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateFrostInstanceHandler().run_loop() diff --git a/src/timeio/grafana_dashboard_setup.py b/src/timeio/grafana_dashboard_setup.py new file mode 100644 index 00000000..4edda7cc --- /dev/null +++ b/src/timeio/grafana_dashboard_setup.py @@ -0,0 +1,483 @@ +import logging +from urllib.parse import urlparse + +from grafana_client import GrafanaApi +from grafana_client.client import GrafanaException + +from base_handler import AbstractHandler, MQTTMessage +from thing import Thing +from utils import get_envvar, setup_logging +from utils.crypto import decrypt, get_crypt_key + +logger = logging.getLogger("grafana-dashboard-setup") + + +class CreateThingInGrafanaHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + + self.api = GrafanaApi.from_url( + url=get_envvar("GRAFANA_URL"), + credential=( + get_envvar("GRAFANA_USER"), + get_envvar("GRAFANA_PASSWORD"), + ), + ) + # needed when defining new datasource + self.sslmode = get_envvar("GRAFANA_DEFAULT_DATASOURCE_SSLMODE") + + def act(self, content: dict, message: MQTTMessage): + thing = 
Thing.get_instance(content) + self.create_organization(thing) + + # create datasource, folder, dashboard in project org + # Give Grafana and Org Admins admin access to folder + # Give Role Editor edit access to folder + org_id = self.organization_from_list(thing.project.name).get("id") + self.create_all_in_org(thing, org_id=org_id, role=2) + + # create team, datasource, folder, dashboard in Main org + # Give Team viewer access to folder + self.create_all_in_org(thing, org_id=1, role=1) + + def create_all_in_org(self, thing, org_id, role): + self.api.organizations.switch_organization(org_id) + self.create_datasource(thing, user_prefix="grf_") + self.create_folder(thing) + # only create team in Main org + if org_id == 1: + self.create_team(thing, org_id) + self.set_folder_permissions(thing, role) + self.create_dashboard(thing) + + def create_organization(self, thing): + name = thing.project.name + if not self.organization_exists(name): + self.api.organization.create_organization({"name": name}) + logger.debug(f"Created organization {name}") + else: + logger.debug(f"Organization {name} already exists") + + def create_team(self, thing, org_id): + name = thing.project.name + if not self.team_exists(name): + self.api.teams.add_team({"name": name, "orgId": org_id}) + logger.debug(f"Created team {name}") + else: + logger.debug(f"Team {name} already exists") + + def create_folder(self, thing): + uid = thing.project.uuid + name = thing.project.name + if not self.folder_exists(uid): + self.api.folder.create_folder(name, uid) + logger.debug(f"Created folder {name}") + else: + logger.debug(f"Folder {name} already exists") + + def create_datasource(self, thing, user_prefix: str): + uuid, name = thing.project.uuid, thing.project.name + if not self.datasource_exists(uuid): + datasource = self.new_datasource(thing, user_prefix) + self.api.datasource.create_datasource(datasource) + logger.debug(f"Created datasource {name}") + else: + logger.debug(f"Datasource {name} already exists") + + def create_dashboard(self, thing, overwrite=True): + # create/update dashboard if it doesn't exist or overwrite is True + if overwrite or not self.dashboard_exists(thing.uuid): + dashboard = self.build_dashboard_dict(thing) + self.api.dashboard.update_dashboard(dashboard) + action = "Updated" if overwrite else "Created" + logger.debug(f"{action} dashboard {thing.name}") + else: + logger.debug(f"Dashboard {thing.name} already exists") + + def new_datasource(self, thing, user_prefix: str): + ds_uid = thing.project.uuid + ds_name = thing.project.name + db_user = user_prefix.lower() + thing.database.ro_username.lower() + db_password = decrypt(thing.database.ro_password, get_crypt_key()) + + # parse thing.database.url to get hostname, port, database name + db_url_parsed = urlparse(thing.database.url) + db_path = db_url_parsed.path.lstrip("/") + # only add port, if it is defined + db_url = db_url_parsed.hostname + if db_url_parsed.port is not None: + db_url += f":{db_url_parsed.port}" + + return { + "name": ds_name, + "uid": ds_uid, + "type": "postgres", + "url": db_url, + "user": db_user, + "access": "proxy", + "basicAuth": False, + "jsonData": { + "database": db_path, + "sslmode": self.sslmode, + "timescaledb": True, + }, + "secureJsonData": {"password": db_password}, + } + + def organization_from_list(self, name): + organizations = self.api.organizations.list_organization() + for org in organizations: + if org.get("name") == name: + return org + return None + + def _exists(self, func: callable, *args) -> bool: + try: + 
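+            # grafana_client raises GrafanaException (e.g. for a 404) when the
+            # requested entity is missing, so a failed lookup is treated as
+            # "does not exist" here.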
func(*args) + except GrafanaException: + return False + else: + return True + + def organization_exists(self, name) -> bool: + return self.organization_from_list(name) is not None + + def datasource_exists(self, uuid) -> bool: + return self._exists(self.api.datasource.get_datasource_by_uid, uuid) + + def dashboard_exists(self, uuid) -> bool: + return self._exists(self.api.dashboard.get_dashboard, uuid) + + def folder_exists(self, uuid) -> bool: + return self._exists(self.api.folder.get_folder, uuid) + + def team_exists(self, name) -> bool: + return bool(self.api.teams.search_teams(query=name)) + + def set_folder_permissions(self, thing, role): + name, uuid = thing.project.name, thing.project.uuid + current_org = self.api.organization.get_current_organization() + if role == 1: + team_id = self.api.teams.search_teams(query=name)[0].get("id") + # set GRAFANA_USER as folder admin and team as Viewer + permissions = { + "items": [ + {"userId": 1, "permission": 4}, + {"teamId": team_id, "permission": role}, + ] + } + else: + logger.debug(f"Set folder permissions for role {role}") + # allow role Editor to edit folder + permissions = { + "items": [ + {"userId": 1, "permission": 4}, + {"role": "Admin", "permission": 4}, + {"role": "Editor", "permission": role}, + ] + } + self.api.folder.update_folder_permissions(uuid, permissions) + + def build_dashboard_dict(self, thing): + dashboard_uid = thing.uuid + dashboard_title = thing.name + folder_uid = thing.project.uuid + folder_title = thing.project.name + datasource_dict = {"type": "postgres", "uid": folder_uid} + + # template variable for datastream positions/properties + datastream_sql = f""" + SELECT property FROM datastream_properties + WHERE t_uuid::text = '{thing.uuid}' + """ + datastream_templating = { + "datasource": datasource_dict, + "hide": 0, + "includeAll": True, + "label": "Datastream", + "multi": True, + "name": "datastream_pos", + "query": datastream_sql, + "refresh": 1, + "sort": 7, + "type": "query", + } + + # template variable for log levels + log_level_templating = { + "datasource": datasource_dict, + "hide": 0, + "includeAll": True, + "label": "Log Level", + "multi": True, + "name": "log_level", + "query": "INFO,WARNING,ERROR", + "refresh": 1, + "sort": 7, + "type": "custom", + } + + show_qaqc_templating = { + "datasource": datasource_dict, + "hide": 0, + "type": "custom", + "name": "show_qaqc_flags", + "label": "Show QAQC Flags", + "query": "False,True", + "multi": False, + "includeAll": False, + "options": [ + {"text": "False", "value": "False", "selected": True}, + {"text": "True", "value": "True", "selected": False}, + ], + } + + # value mapping and overrides for log levels in journal panel + log_level_mapping = { + "options": { + "ERROR": {"color": "#9d545d", "index": 2}, + "INFO": {"color": "#6d9967", "index": 0}, + "WARNING": {"color": "#b48250", "index": 1}, + }, + "type": "value", + } + + log_level_overrides = { + "matcher": {"id": "byName", "options": "level"}, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "applyToRow": True, + "mode": "gradient", + "type": "color-background", + }, + } + ], + } + + # template variable for log origins + log_origin_sql = f""" + SELECT DISTINCT origin FROM journal j + JOIN thing t on j.thing_id = t.id + WHERE t.uuid::text = '{thing.uuid}' + """ + log_origin_templating = { + "datasource": datasource_dict, + "hide": 0, + "includeAll": True, + "label": "Log Origin", + "multi": True, + "name": "log_origin", + "query": log_origin_sql, + "refresh": 1, + "sort": 7, + "type": 
"query", + } + + # query to get observations, used in timeseries panel + observation_sql = f""" + WITH date_filtered AS ( + -- This query returns the data chosen by the datepicker, or + -- returns null if no data is in the selected date range. + SELECT + o.result_time AS "time", + o.result_number AS "value" + FROM observation o + WHERE $__timeFilter(o.result_time) + AND o.datastream_id = ( + SELECT dp.ds_id FROM datastream_properties dp + WHERE ${{datastream_pos:singlequote}} in (dp.property, dp.position) + AND dp.t_uuid :: text = '{thing.uuid}') + ORDER BY o.result_time DESC + LIMIT 1000000 -- 1M + ), + fallback AS ( + -- This query returns the most recent 10k datapoints + SELECT + o.result_time AS "time", + o.result_number AS "value" + FROM observation o + WHERE o.datastream_id = ( + SELECT dp.ds_id FROM datastream_properties dp + WHERE ${{datastream_pos:singlequote}} in (dp.property, dp.position) + AND dp.t_uuid :: text = '{thing.uuid}') + ORDER BY o.result_time DESC -- most recent + LIMIT 10000 -- 10k + ) + -- First the date_filtered query is executed. If it returns + -- null, because the user selected a time range without any + -- data, the fallback query is executed and return the most + -- recent 10k data points. This fallback data is not shown + -- immediately, because it is also not the selected timerange, + -- but grafana will now show a ZoomToData button. If the user + -- press the button, the panel will jump to the data from the + -- fallback query (the most recent 10k data points). + SELECT * FROM date_filtered + UNION ALL + SELECT * FROM fallback + WHERE NOT EXISTS (SELECT 1 FROM date_filtered) + ORDER BY "time" ASC + """ + qaqc_sql = f""" + SELECT o.result_time AS "time", + 1 AS "quality_flag", + jsonb_build_object( + 'annotation', CAST (jsonb_extract_path_text(result_quality, 'annotation') AS DECIMAL), + 'measure', jsonb_extract_path_text(result_quality, 'properties', 'measure'), + 'user_label', jsonb_extract_path_text(result_quality, 'properties', 'userLabel') + ) AS "qaqc_result" + FROM observation o + WHERE o.datastream_id = ( + SELECT dp.ds_id + FROM datastream_properties dp + WHERE ${{datastream_pos:singlequote}} in (dp.property,dp.position) + AND dp.t_uuid::text = '{thing.uuid}' + ) AND ${{show_qaqc_flags}} = 'True' + AND result_quality IS NOT NULL + AND result_quality <> 'null' + AND jsonb_extract_path_text(result_quality, 'annotation') <> '0.0' + AND jsonb_extract_path_text(result_quality, 'annotation') <> '-inf' + ORDER BY o.result_time ASC + """ + show_qaqc_overrides = { + "matcher": { + "id": "byFrameRefID", + "options": "B", + }, + "properties": [ + {"id": "custom.drawStyle", "value": "points"}, + {"id": "custom.axisPlacement", "value": "hidden"}, + {"id": "custom.axisSoftMax", "value": 1}, + {"id": "custom.pointSize", "value": 7}, + ], + } + # build observations panel dict + observation_panel = { + "datasource": datasource_dict, + "gridPos": {"h": 8}, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": False, + } + }, + "maxPerRow": 3, + "repeat": "datastream_pos", + "repeatDirection": "h", + "fieldConfig": { + "overrides": [show_qaqc_overrides], + }, + "targets": [ + { + "datasource": datasource_dict, + "editorMode": "code", + "format": "time_series", + "rawQuery": True, + "rawSql": observation_sql, + "refId": "A", + }, + { + "datasource": datasource_dict, + "editorMode": "code", + "format": "time_series", + "rawQuery": True, + "rawSql": qaqc_sql, + "refId": "B", + }, + ], + "title": "$datastream_pos", 
+ "type": "timeseries", + } + observation_row = { + "collapsed": False, + "gridPos": {"h": 1, "w": 24}, + "panels": [], + "title": "Observations", + "type": "row", + } + + # query to get journal messages + journal_sql = f""" + SELECT timestamp, level, message, origin FROM journal + JOIN thing t on journal.thing_id = t.id + WHERE t.uuid::text = '{thing.uuid}' + AND level in ($log_level) + AND origin in ($log_origin) + ORDER BY timestamp DESC + """ + + # build journal panel dict + journal_panel = { + "datasource": datasource_dict, + "fieldConfig": { + "defaults": {"mappings": [log_level_mapping]}, + "overrides": [log_level_overrides], + }, + "gridPos": {"h": 8, "w": 12}, + "targets": [ + { + "datasource": datasource_dict, + "editorMode": "code", + "format": "table", + "rawQuery": True, + "rawSql": journal_sql, + "refId": "A", + } + ], + "title": "Status Journal", + "type": "table", + } + journal_row = { + "collapsed": True, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "panels": [journal_panel], + "title": "Status Journal", + "type": "row", + } + + # build dashboard dictionary + dashboard = { + "editable": True, + "liveNow": True, + "panels": [ + journal_row, + observation_row, + observation_panel, + ], + "refresh": False, + "tags": [folder_title, dashboard_title, "TSM_automation"], + "templating": { + "list": [ + datastream_templating, + log_level_templating, + log_origin_templating, + show_qaqc_templating, + ] + }, + "time": {"from": "now-7d", "to": "now"}, + "title": dashboard_title, + "uid": dashboard_uid, + } + + return { + "dashboard": dashboard, + "folderUid": folder_uid, + "message": "created by TSM dashboard automation", + "overwrite": True, + } + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateThingInGrafanaHandler().run_loop() diff --git a/src/timeio/grafana_user_setup.py b/src/timeio/grafana_user_setup.py new file mode 100644 index 00000000..bbe7978a --- /dev/null +++ b/src/timeio/grafana_user_setup.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +import logging +import re +import time +from typing import Literal + +import grafana_client.api +from grafana_client.client import GrafanaException + +from base_handler import AbstractHandler, MQTTMessage +from utils import get_envvar, setup_logging +from utils.journaling import Journal + +logger = logging.getLogger("grafana-user-setup") + + +class Grafana: + + def __init__(self, url: str, user: str, password: str): + self.api = grafana_client.api.GrafanaApi.from_url( + url=url, credential=(user, password) + ) + + def get_user_id(self, name, max_tries=5, sleep=2) -> int | None: + for cnt in range(max_tries): + try: + return self.api.users.find_user(name).get("id") + except GrafanaException: + if cnt < max_tries - 1: + logger.debug(f"Fetching user {name!r} faild. 
Retrying...") + time.sleep(sleep) + return None + + def get_all_orgs(self) -> dict[str, int]: + orgs = self.api.organizations.list_organization() + return {org.get("name"): org.get("id") for org in orgs} + + def get_user_orgs(self, user_id) -> dict[str, str]: + orgs = self.api.users.get_user_organisations(user_id) + return {org.get("name"): org.get("role") for org in orgs} + + def update_user_orgs(self, user_id, user_name, new_orgs) -> None: + old_orgs = self.get_user_orgs(user_id) + user_add_orgs = new_orgs.keys() - old_orgs.keys() + user_update_orgs = old_orgs.keys() & new_orgs.keys() + user_delete_orgs = old_orgs.keys() - new_orgs.keys() - {"Main Org."} + + # todo: should we also really create new orgs that + # was never seen before? then we must also iterate + # over create_org_names outside this loop. + + # no, we don't want to create org names that don't yet exist in grafana + # creating the orgs should be done by the grafana_dashboard_setup.py + + all_orgs = self.get_all_orgs() + all_orgs.pop("Main Org.", None) + # We iterate over all orgs in Grafana and add/update/delete user in respective org + for org, org_id in all_orgs.items(): + self.api.organizations.switch_organization(org_id) + new_role = new_orgs.get(org) + old_role = old_orgs.get(org) + if org in user_add_orgs: + logger.debug(f"Add user to org. U=%s, O=%s", user_name, org) + self.add_user_to_org(new_role, user_name) + self.api.organizations.switch_organization(1) + self.add_user_to_team(org, user_id) + elif org in user_update_orgs: + if new_role != old_role: + logger.debug( + f"Update user. U=%s, O=%s, R=%s", user_name, org, new_role + ) + # grafana: "cannot change role for externally synced user" + # so we have to delete and re-add the user + self.remove_user_from_org(org) + self.add_user_to_org(new_role, user_name) + self.api.organizations.switch_organization(1) + if not self.user_in_team(org_id, user_id): + self.add_user_to_team(org, user_id) + elif org in user_delete_orgs: + logger.debug(f"Remove user from org. 
U=%s, O=%s", user_name, org) + self.remove_user_from_org(org) + self.api.organizations.switch_organization(1) + self.remove_user_from_team(org, user_id) + + def add_user_to_org(self, role, login_name) -> None: + user = {"role": role, "loginOrEmail": login_name} + self.api.organization.add_user_current_organization(user) + + def remove_user_from_org(self, user_id) -> None: + self.api.organization.delete_user_current_organization(user_id) + + def user_in_team(self, team_id, user_id) -> bool: + team_members = self.api.teams.get_team_members(team_id) + for member in team_members: + if member.get("userId") == user_id: + return True + return False + + def get_team_id_by_name(self, org) -> int | None: + team = self.api.teams.search_teams(query=org) + if team: + return self.api.teams.search_teams(query=org)[0].get("id") + return None + + def add_user_to_team(self, org, user_id) -> None: + team_id = self.get_team_id_by_name(org) + if team_id and not self.user_in_team(team_id, user_id): + self.api.teams.add_team_member(team_id, user_id) + + def remove_user_from_team(self, org, user_id) -> None: + team_id = self.get_team_id_by_name(org)[0].get("id") + if team_id and self.user_in_team(team_id, user_id): + self.api.teams.remove_team_member(team_id, user_id) + + +class CreateGrafanaUserHandler(AbstractHandler): + + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.gf = Grafana( + url=get_envvar("GRAFANA_URL"), + user=get_envvar("GRAFANA_USER"), + password=get_envvar("GRAFANA_PASSWORD"), + ) + self.vo_group_key = get_envvar("VO_GROUP_KEY", "eduperson_entitlement") + self.vo_login_key = get_envvar("VO_LOGIN_KEY", "eduperson_principal_name") + # if true, vo group admins become grafana organization admins, + # otherwise, they become editors + self.gf_roles_from_vo_roles = get_envvar( + "GF_ROLE_FROM_VO_ROLE", False, cast_to=bool + ) + self.vo_admin_subgroup = get_envvar("VO_ADMIN_SUBGROUP", "ufz-sms-admin") + self.allowed_vos = get_envvar("ALLOWED_VOS") + + def act(self, content: dict, message: MQTTMessage): + user_name = content.get(self.vo_login_key) + vo_groups = content.get(self.vo_group_key) + if user_id := self.gf.get_user_id(user_name): + logger.debug(f"Found user {user_name} with user_id {user_id}") + gf_groups = self.map_vo_groups_to_gf_orgs(vo_groups) + self.gf.update_user_orgs(user_id, user_name, gf_groups) + else: + logger.warning(f"Could not find user {user_name}. Skipping user sync.") + + def map_vo_groups_to_gf_orgs( + self, vo_groups + ) -> dict[str, Literal["Admin", "Editor"]]: + """Translate VO groups to grafana groups.""" + orgs = {} + allowed_vos = "|".join(self.allowed_vos.split(",")) + + # match.group(1): the VO (e.g. UFZ-Timeseries-Management) or None + # match.group(2): the VO group (e.g. DemoGroup) + # match.group(3): VO_ADMIN_SUBGROUP with colon (e.g. :ufz-sms-admin) or None if not present + # match.group(4): VO_ADMIN_SUBGROUP without colon (e.g. ufz-sms-admin) or None if not present + pattern = rf"({allowed_vos}):([^:#]+)(:({self.vo_admin_subgroup or ''}))?" 
+ + for vo_group in vo_groups: + admin = org_name = role = None + if match := re.search(pattern, vo_group): + org_name = f"{match.group(1)}:{match.group(2)}" + admin = match.group(4) + role: Literal["Admin", "Editor"] = "Editor" + if admin and self.gf_roles_from_vo_roles: + role = "Admin" + orgs[org_name] = role + + logger.debug( + "vo_group: %s\nvo_admin_subgroup: %s\ngf_org: %s\ngf_role: %s\n", + *(vo_group, admin, org_name, role), + ) + return orgs + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateGrafanaUserHandler().run_loop() diff --git a/src/timeio/minio_setup.py b/src/timeio/minio_setup.py new file mode 100644 index 00000000..6288c790 --- /dev/null +++ b/src/timeio/minio_setup.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import logging + +from minio_cli_wrapper.mc import Mc + +from base_handler import AbstractHandler, MQTTMessage +from thing import Thing +from utils import get_envvar, setup_logging +from utils.crypto import decrypt, get_crypt_key + +logger = logging.getLogger("minio-setup") + + +class CreateThingInMinioHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + # Custom minio client wrapper + self.mcw = Mc( + url=get_envvar("MINIO_URL"), + access_key=get_envvar("MINIO_ACCESS_KEY"), + secret_key=get_envvar("MINIO_SECURE_KEY"), + secure=get_envvar("MINIO_SECURE", default=True, cast_to=bool), + ) + + def act(self, content: dict, message: MQTTMessage): + thing = Thing.get_instance(content) + user = thing.raw_data_storage.username + passw = decrypt(thing.raw_data_storage.password, get_crypt_key()) + bucket = thing.raw_data_storage.bucket_name + + # create user + # not implemented in minio python sdk yet :( + # so we have to use minio cli client wrapper + logger.info("create user") + self.mcw.user_add(user, passw) + + # mc admin policy add myminio/ datalogger1-policy /root/iam-policy-datalogger1.json + # not implemented in minio python sdk yet :( + self.mcw.policy_add( + user, + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetBucketLocation", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObject", + ], + "Resource": [ + f"arn:aws:s3:::{bucket}", + f"arn:aws:s3:::{bucket}/*", + ], + } + ], + }, + ) + + # mc admin policy set myminio/ datalogger1-policy user=datalogger1-user + # not implemented in minio python sdk yet :( + self.mcw.policy_set_user(user, user) + + # Create bucket + if self.mcw.bucket_exists(bucket): + logger.info(f"bucket {bucket} already exists") + else: + logger.info(f"create bucket {bucket}") + try: + self.mcw.make_locked_bucket(bucket) + except Exception as e: + raise ValueError(f'Unable to create bucket "{bucket}"') from e + + self.mcw.set_bucket_100y_retention(bucket) + self.mcw.enable_bucket_notification(bucket) + logger.info("store bucket metadata (db connection, thing uuid, etc.)") + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateThingInMinioHandler().run_loop() diff --git a/src/timeio/mqtt_ingest.py b/src/timeio/mqtt_ingest.py new file mode 100644 index 00000000..efbdbce2 --- /dev/null +++ b/src/timeio/mqtt_ingest.py @@ -0,0 +1,56 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import annotations + +import logging +import typing + +from paho.mqtt.client import MQTTMessage + +import databases +from base_handler import AbstractHandler +from utils import get_envvar, setup_logging +from utils.errors import UserInputError +from utils.journaling import Journal + +logger = logging.getLogger("mqtt-ingest") +journal = Journal("Parser") + + +class ParseMqttDataHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + + self.confdb = databases.ConfigDB(get_envvar("CONFIGDB_DSN")) + self.dbapi = databases.DBapi(get_envvar("DB_API_BASE_URL")) + + def act(self, content: typing.Any, message: MQTTMessage): + topic = message.topic + origin = f"{self.mqtt_broker}/{topic}" + logger.info(f"get thing") + mqtt_user = topic.split("/")[1] + thing_uuid = self.confdb.get_thing_uuid("mqtt_user", mqtt_user) + logger.info(f"get parser") + parser = self.confdb.get_mqtt_parser(thing_uuid) + logger.info(f"parsing rawdata") + try: + data = parser.do_parse(content, origin) + observations = parser.to_observations(data, thing_uuid) + except Exception as e: + raise UserInputError("Parsing data failed") from e + logger.info(f"store observations") + self.dbapi.upsert_observations(thing_uuid, observations) + journal.info(f"parsed mqtt data from {origin}", thing_uuid) + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + ParseMqttDataHandler().run_loop() diff --git a/src/timeio/mqtt_user_setup.py b/src/timeio/mqtt_user_setup.py new file mode 100644 index 00000000..1e3b94e3 --- /dev/null +++ b/src/timeio/mqtt_user_setup.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import json +import logging + +import psycopg2 + +from base_handler import AbstractHandler, MQTTMessage +from thing import Thing +from utils import get_envvar, setup_logging +from utils.journaling import Journal + +logger = logging.getLogger("mqtt-user-setup") +journal = Journal("System") + + +class CreateMqttUserHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.db = psycopg2.connect(get_envvar("DATABASE_URL")) + + def act(self, content: dict, message: MQTTMessage): + thing = Thing.get_instance(content) + if content["mqtt_authentication_credentials"]: + user = content["mqtt_authentication_credentials"]["username"] + pw = content["mqtt_authentication_credentials"]["password_hash"] + + logger.info(f"create user. 
{user=}") + created = self.create_user(thing, user, pw) + action = "Created" if created else "Updated" + journal.info(f"{action} MQTT user {user}", thing.uuid) + else: + logger.warning(f"no 'mqtt_authentication_credentials' present") + + def create_user(self, thing, user, pw) -> bool: + """Returns True for insert and False for update""" + sql = ( + "INSERT INTO mqtt_auth.mqtt_user (project_uuid, thing_uuid, username, " + "password, description,properties, db_schema) " + "VALUES (%s, %s, %s, %s ,%s ,%s, %s) " + "ON CONFLICT (thing_uuid) " + "DO UPDATE SET" + " project_uuid = EXCLUDED.project_uuid," + " username = EXCLUDED.username," + " password=EXCLUDED.password," + " description = EXCLUDED.description," + " properties = EXCLUDED.properties," + " db_schema = EXCLUDED.db_schema " + "RETURNING (xmax = 0)" + ) + with self.db: + with self.db.cursor() as c: + c.execute( + sql, + ( + thing.project.uuid, + thing.uuid, + user, + pw, + thing.description, + json.dumps(thing.properties), + thing.database.username, + ), + ) + return c.fetchone()[0] + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + CreateMqttUserHandler().run_loop() diff --git a/src/timeio/parsing.py b/src/timeio/parsing.py new file mode 100644 index 00000000..b631717a --- /dev/null +++ b/src/timeio/parsing.py @@ -0,0 +1,413 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import annotations + +import json +import logging +import math +import re +import warnings +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime +from io import StringIO +from typing import Any, TypedDict, TypeVar, cast + +import pandas as pd +from typing_extensions import Required + +from utils.common import ObservationResultType +from utils.errors import ParsingError, ParsingWarning +from utils.types import JsonT +from utils.journaling import Journal + +parsedT = TypeVar("parsedT") +journal = Journal("Parser") + + +def filter_lines(rawdata: str, skip: tuple | str) -> str: + if isinstance(skip, str): + skip = (skip,) + pattern = "|".join(skip) + return "\n".join(ln for ln in rawdata.splitlines() if not re.match(pattern, ln)) + + +class ObservationPayloadT(TypedDict, total=False): + phenomenon_time_start: datetime + phenomenon_time_end: datetime + result_time: Required[datetime] + result_type: Required[int] + result_number: float | int + result_string: str + result_boolean: bool + result_json: JsonT + result_latitude: float | int + result_longitude: float | int + result_altitude: float | int + result_quality: JsonT + valid_time_start: datetime + valid_time_end: datetime + parameters: JsonT + datastream_pos: Required[str] + + +class Parser(ABC): + @abstractmethod + def do_parse(self, *args) -> parsedT: + raise NotImplementedError + + @abstractmethod + def to_observations(self, *args) -> list[ObservationPayloadT]: + raise NotImplementedError + + +class FileParser(Parser): + def __init__(self, settings: dict[str, Any]): + self.logger = logging.getLogger(self.__class__.__qualname__) + self.settings = settings + name = self.__class__.__name__ + self.logger.debug(f"parser settings in use with {name}: {self.settings}") + + @abstractmethod + def do_parse(self, rawdata: Any) -> pd.DataFrame: + raise NotImplementedError + + def to_observations( + self, data: pd.DataFrame, origin: str + ) -> list[ObservationPayloadT]: + observations = [] + + data.index.name = "result_time" + data.index = data.index.strftime("%Y-%m-%dT%H:%M:%S%Z") + + to_process = [val for _, val in 
data.items()] + + while to_process: + chunk = to_process.pop() + col = str(chunk.name) + if pd.api.types.is_numeric_dtype(chunk): + chunk.name = "result_number" + result_type = ObservationResultType.Number + elif pd.api.types.is_bool_dtype(chunk): + chunk.name = "result_bool" + result_type = ObservationResultType.Bool + elif pd.api.types.is_object_dtype(chunk): + # we need to handle object columns with special care + + # try to seperate out numerical values to account for data (transmission) errors + numeric_chunk = cast(pd.Series, pd.to_numeric(chunk, errors="coerce")) + if numeric_chunk.isna().all(): + # no numerical values found, we have a string only column + chunk.name = "result_string" + result_type = ObservationResultType.String + else: + # numerical values found -> the column consists of mixed data types, currently + # we only distinguish between numerical and string values, this could be + # improved in the future + str_chunk = cast( + pd.Series, chunk.loc[numeric_chunk.isna()].str.strip() + ) + to_process.extend((numeric_chunk, str_chunk)) + continue + else: + raise ParsingError( + f"Data of type {chunk.dtype} is not supported. " + f"In {origin or 'datafile'}, column {col}" + ) + + # we don't want to write NaN + chunk = chunk.dropna() + # add metadata + chunk = chunk.reset_index() + chunk["result_type"] = result_type + chunk["datastream_pos"] = str(col) + chunk["parameters"] = json.dumps({"origin": origin, "column_header": col}) + + observations.extend(chunk.to_dict(orient="records")) + + return observations + + +class CsvParser(FileParser): + def _cast_index(self, index: pd.Series, fmt: str) -> pd.DatetimeIndex: + try: + index = index.str.strip() + except AttributeError: + pass + out = pd.to_datetime(index, format=fmt, errors="coerce") + if out.isna().any(): + nat = out.isna() + warnings.warn( + f"Could not parse {nat.sum()} of {out.count()} timestamps " + f"with provided timestamp format {fmt!r}. First failing " + f"timestamp: '{index[nat].iloc[0]}'", + ParsingWarning, + ) + out.name = None + return pd.DatetimeIndex(out) + + def do_parse(self, rawdata: str) -> pd.DataFrame: + """ + Parse rawdata string to pandas.DataFrame + rawdata: the unparsed content + NOTE: + we need to preserve the original column numbering + and check for the date index column + """ + settings = self.settings.copy() + index_col = settings.pop("index_col") + + if "comment" in settings: + rawdata = filter_lines(rawdata, settings.pop("comment")) + + try: + df = pd.read_csv(StringIO(rawdata), **settings) + except (pd.errors.EmptyDataError, IndexError): # both indicate no data + df = pd.DataFrame() + + # We always use positions as header + df.columns = range(len(df.columns)) + + # remove all-nan columns as artifacts + df = df.dropna(axis=1, how="all") + if df.empty: + return pd.DataFrame(index=pd.DatetimeIndex([])) + + if index_col not in df.columns: + raise ParsingError( + f"Could not get Timestamp-Column {index_col}. " + f"Data has only {len(df.columns)} columns." 
+ ) + + df.index = self._cast_index(df.pop(index_col), settings["date_format"]) + # remove rows with broken dates + df = df.loc[df.index.notna()] + + self.logger.debug(f"data.shape={df.shape}") + return df + + +# ============================================================ +# Mqtt Parser +# ============================================================ + + +@dataclass +class Observation: + # This is a legacy class of the datastore_lib + # see tsm_datastore_lib.Observation + timestamp: datetime + value: float | int | str | bool + origin: str + position: int + header: str = "" + + def __post_init__(self): + if self.value is None: + raise ValueError("None is not allowed as observation value.") + if math.isnan(self.value): + raise ValueError("NaN is not allowed as observation value.") + + +class MqttDataParser(Parser): + def __init__(self): + self.logger = logging.getLogger(self.__class__.__qualname__) + + @abstractmethod + def do_parse(self, rawdata: Any, origin: str) -> list[Observation]: + raise NotImplementedError + + def to_observations( + self, data: list[Observation], thing_uuid: str + ) -> list[ObservationPayloadT]: + result = [] + for ob in data: + obpay: ObservationPayloadT = { + "result_time": ob.timestamp, + "datastream_pos": str(ob.position), + "parameters": { + "origin": ob.origin, + "column_header": ob.header, + }, + "result_number": None, + "result_string": None, + "result_boolean": None, + "result_json": None, + } + if isinstance(ob.value, (float, int)): + obpay["result_number"] = ob.value + obpay["result_type"] = ObservationResultType.Number + elif isinstance(ob.value, str): + obpay["result_string"] = ob.value + obpay["result_type"] = ObservationResultType.String + elif isinstance(ob.value, bool): + obpay["result_boolean"] = ob.value # noqa + obpay["result_type"] = ObservationResultType.Bool + elif isinstance(ob.value, dict): + obpay["result_json"] = json.dumps(ob.value) + obpay["result_type"] = ObservationResultType.Json + else: + journal.warning( + f"Data of type {type(ob.value).__name__} is " + f"not supported. 
Failing Observation: {ob}", + thing_uuid, + ) + continue + result.append(obpay) + + return result + + +class CampbellCr6Parser(MqttDataParser): + # the basic data massage looked like this + # { + # "type": "Feature", + # "geometry": {"type": "Point", "coordinates": [null, null, null]}, + # "properties": { + # "loggerID": "CR6_18341", + # "observationNames": ["Batt_volt_Min", "PTemp"], + # "observations": {"2022-05-24T08:53:00Z": [11.9, 26.91]} + # } + # } + + def do_parse(self, rawdata: Any, origin: str = "", **kwargs) -> list[Observation]: + properties = rawdata.get("properties") + if properties is None: + return [] + + out = [] + for timestamp, values in properties["observations"].items(): + for i, (key, value) in enumerate( + zip(properties["observationNames"], values) + ): + out.append( + Observation( + timestamp=timestamp, + value=value, + position=i, + origin=origin, + header=key, + ) + ) + return out + + +class YdocMl417Parser(MqttDataParser): + # mqtt_ingest/test-logger-pb/test/data/jsn + # { + # "device": + # {"sn":99073020,"name":"UFZ","v":"4.2B5","imei":353081090730204,"sim":89490200001536167920}, + # "channels":[ + # {"code":"SB","name":"Signal","unit":"bars"}, + # {"code":"MINVi","name":"Min voltage","unit":"V"}, + # {"code":"AVGVi","name":"Average voltage","unit":"V"}, + # {"code":"AVGCi","name":"Average current","unit":"mA"}, + # {"code":"P1*","name":"pr2_1_10","unit":"m3/m3"}, + # {"code":"P2","name":"pr2_1_20","unit":"m3/m3"}, + # {"code":"P3","name":"pr2_1_30","unit":"m3/m3"}, + # {"code":"P4","name":"pr2_1_40","unit":"m3/m3"}, + # {}], + # "data":[ + # {"$ts":230116110002,"$msg":"WDT;pr2_1"}, <== we ignore that (*) + # {"$ts":230116110002,"MINVi":3.74,"AVGVi":3.94,"AVGCi":116,"P1*":"0*T","P2":"0*T","P3":"0*T","P4":"0*T"}, + # {}]} + + def do_parse(self, rawdata: Any, origin: str = "", **kwargs) -> list[Observation]: + if "data/jsn" not in origin: + return [] + + # data = payload['data'][1] + ret = [] + for data in rawdata["data"]: + try: + ts = datetime.strptime(str(data["$ts"]), "%y%m%d%H%M%S") + ob0 = Observation(ts, data["MINVi"], origin, 0, header="MINVi") + ob1 = Observation(ts, data["AVGVi"], origin, 1, header="AVGCi") + ob2 = Observation(ts, data["AVGCi"], origin, 2, header="AVGCi") + ob3 = Observation(ts, data["P1*"], origin, 3, header="P1*") + ob4 = Observation(ts, data["P2"], origin, 4, header="P2") + ob5 = Observation(ts, data["P3"], origin, 5, header="P3") + ob6 = Observation(ts, data["P4"], origin, 6, header="P4") + ret.extend([ob0, ob1, ob2, ob3, ob4, ob5, ob6]) + except KeyError: + # we ignore data that not have all keys + # see also the example above the function at (*) + pass + return ret + + +class BrightskyDwdApiParser: + def do_parse(self, rawdata: Any, origin: str = "", **kwargs) -> list[Observation]: + weather = rawdata["weather"] + timestamp = weather.pop("timestamp") + source = rawdata["sources"][0] + out = [] + for prop, value in weather.items(): + try: + out.append( + Observation( + timestamp=timestamp, + value=value, + position=prop, + origin=origin, + header=source, + ) + ) + except ValueError: # value is NaN or None + continue + + return out + + +class SineDummyParser: + def do_parse(self, rawdata: Any, origin: str = "", **kwargs) -> list[Observation]: + timestamp = datetime.now() + return [ + Observation( + timestamp=timestamp, + value=rawdata["sine"], + position=0, + origin=origin, + header="sine", + ), + Observation( + timestamp=timestamp, + value=rawdata["cosine"], + position=1, + origin=origin, + header="cosine", + ), + ] + + +def 
get_parser(parser_type, settings) -> FileParser | MqttDataParser: + types = { + "csvparser": CsvParser, + "campbell_cr6": CampbellCr6Parser, + "brightsky_dwd_api": BrightskyDwdApiParser, + "ydoc_ml417": YdocMl417Parser, + "sine_dummy": SineDummyParser, + } + klass = types.get(parser_type) + if klass is None: + raise NotImplementedError(f"parser {parser_type!r} not known") + + if issubclass(klass, FileParser): + if not settings: + settings = {} + default_settings = { + "comment": "#", + "decimal": ".", + "na_values": None, + "encoding": "utf-8", + "engine": "python", + "on_bad_lines": "warn", + "header": None, + } + + kwargs = settings.pop("pandas_read_csv") or {} + settings = {**default_settings, **kwargs, **settings} + settings["index_col"] = settings.pop("timestamp_column") + settings["date_format"] = settings.pop("timestamp_format") + return klass(settings) + return klass() diff --git a/src/timeio/qualcontrol.py b/src/timeio/qualcontrol.py new file mode 100755 index 00000000..315e56db --- /dev/null +++ b/src/timeio/qualcontrol.py @@ -0,0 +1,844 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import datetime +import json +import logging +import subprocess +import sys +import typing +import warnings +from typing import Any, Hashable, Literal, TypedDict, cast + +import pandas as pd +import requests +import saqc +from psycopg import Connection, sql +from psycopg.rows import dict_row +from saqc import DictOfSeries, Flags + +try: + from psycopg_pool import ConnectionPool +except ImportError: + ConnectionPool = typing.TypeVar("ConnectionPool") # noqa + + +from utils.common import ObservationResultType +from utils.errors import DataNotFoundError, UserInputError, NoDataWarning +from utils.types import ConfDB, DbRowT, JsonObjectT +from utils.journaling import Journal + +try: + import tsm_user_code # noqa, this registers user functions on SaQC +except ImportError: + warnings.warn("could not import module 'tsm_user_code'") + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +TimestampT = typing.Union[datetime.datetime.timestamp, pd.Timestamp] + +journal = Journal("QualityControl") + +_OBS_COLUMNS = [ + "result_time", + "result_type", + "result_number", + "result_string", + "result_json", + "result_boolean", + "result_quality", +] + + +def update_timeio_user_code(): + """ + Function to install/upgrade the python package [1], that + provide the custom saqc user code during runtime. + + References + ---------- + [1]: https://codebase.helmholtz.cloud/ufz-tsm/tsm-dataprocessing-extension + """ + + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "-U", + "git+https://codebase.helmholtz.cloud/ufz-tsm/tsm-dataprocessing-extension.git", + ] + ) + + +def dict_update_to_list(dict_: dict, key: Hashable, value: Any) -> None: + """ + Updates dict values inplace. Present keys are updated by casting + the current value to a list, which then contains the current and + the new value. 
+ + d = {"a": 0} + dict_update_to_list(d, "a", 99) + d ==> {"a": [0, 99]} + dict_update_to_list(d, "a", 99) + d ==> {"a": [0, 99, 99]} + """ + if (curr := dict_.get(key)) is None: + dict_[key] = value + elif isinstance(curr, list): + dict_[key].append(value) + else: + dict_[key] = [dict_[key], value] + + +def check_keys_by_TypedDict(value: dict, expected: type[typing.TypedDict], name: str): + missing = expected.__required_keys__ - value.keys() + if missing: + raise KeyError(f"{', '.join(missing)} are a mandatory keys for {name!r}") + + +def ping_dbapi(base_url): + r = requests.get(f"{base_url}/health") + r.raise_for_status() + + +class KwargsScheme(saqc.core.core.FloatScheme): + + @staticmethod + def get_meta_per_row(history: saqc.core.history.History) -> pd.DataFrame: + """ + Returns a dataframe with the following columns: + - func: str - name of last applied function (might be an empty str) + - kwargs: dict - kwargs of last applied function (might be an empty dict) + """ + + # meta: (list of dicts with keys: func, args, kwargs) + # [{...}, {...}, {...}] + # history -> idx -> meta lookup + # 0 1 2 + # nan 25 30 -> 2 -> func/kwargs from meta[2] + # nan 25 nan -> 1 -> func/kwargs from meta[1] + # nan nan nan -> nan -> set func/kwargs to empty values + # RESULT: + # flag func kwargs + # 30 "flagBar" {kw3: v3, kw4: v4, ...} + # 25 "flagFoo" {kw1: v1, kw2: v2, ...} + # nan "" {} + def map_meta(i, meta) -> pd.Series: + if pd.isna(i): + return pd.Series({"func": "", "kwargs": {}}) + m = meta[int(i)] + return pd.Series({"func": m.get("func", ""), "kwargs": m.get("kwargs", {})}) + + meta_idx = history.hist.astype(float).agg(pd.Series.last_valid_index, axis=1) + return meta_idx.apply(map_meta, meta=history.meta) + + def toExternal(self, flags: Flags, attrs: dict | None = None) -> DictOfSeries: + """ + Returns a DictOfSeries with elements of type pd.Dataframe with + the following columns: + - flag: float - the float quality label + - func: str - name of last applied function (might be empty str) + - kwargs: dict - kwargs of last applied function (might be empty) + + For unflagged data points (no saqc quality function set a flag) the + flag will be `saqc.UNFLAGGED`. The corresponding func and kwargs will + be empty. 
+ """ + tflags = super().toExternal(flags, attrs) + out = DictOfSeries() + columns = pd.Index(["flag", "func", "kwargs"]) + for key, value in tflags.items(): + if tflags[key].empty: + out[key] = pd.DataFrame(index=tflags[key].index, columns=columns) + continue + df = self.get_meta_per_row(flags.history[key]) + df["flag"] = tflags[key].fillna(saqc.UNFLAGGED) + out[key] = df[columns].copy() # reorder + return out + + +class QualityControl: + conn: Connection + api_url: str + thing: TypedDict("ThingT", {"id": int, "name": str, "uuid": str}) + proj: ConfDB.ProjectT + schema: str + conf: ConfDB.QaqcT + tests: list[ConfDB.QaqcTestT] + window: pd.Timedelta | int + legacy: bool + + def __init__(self, conn: Connection, dbapi_url: str, thing_uuid: str): + self.conn: Connection = conn + self.api_url = dbapi_url + self.schema = self.fetch_schema(thing_uuid) + self.proj = self.fetch_project(thing_uuid) + self.thing = self.fetch_thing(thing_uuid) + if not self.thing: + raise DataNotFoundError(f"A thing with UUID {thing_uuid} does not exist") + self.conf = self.fetch_qaqc_config(thing_uuid) + if not self.conf: + raise NoDataWarning( + f"No qaqc config present in project {self.proj['name']}" + ) + self.tests = self.fetch_qaqc_tests(self.conf["id"]) + self.window = self.parse_ctx_window(self.conf["context_window"]) + self.legacy = any(map(lambda t: t.get("position") is not None, self.tests)) + + @staticmethod + def extract_data_by_result_type(df: pd.DataFrame) -> pd.Series: + """Selects the column, specified as integer in the column 'result_type'.""" + if df.empty: + return pd.Series(index=df.index, dtype=float) + dtype_mapping = { + "result_type": NotImplemented, + "result_number": float, + "result_string": str, + "result_boolean": bool, + "result_json": object, + } + columns = pd.Index(dtype_mapping.keys()) + df = df[columns].copy(deep=False) + + # If we have a single result-type we can map a matching dtype + if len(rt := df["result_type"].value_counts()) == 1: + column = columns[rt.index[0] + 1] + return df[column].astype(dtype_mapping[column]) + + # We can't use raw=True (numpy-speedup) because the numpy arrays + # don't always preserve the dtype. E.g. a bool might be cast + # to float if another value is a float. + return df.apply(lambda row: row.iloc[1 + row.iloc[0]], axis=1) + + @staticmethod + def parse_ctx_window(window: int | str) -> pd.Timedelta | int: + """Parse the `context_window` value of the config.""" + if isinstance(window, int) or isinstance(window, str) and window.isnumeric(): + window = int(window) + is_negative = window < 0 + else: + window = pd.Timedelta(window) + is_negative = window.days < 0 + + if is_negative: + raise UserInputError( + "Parameter 'context_window' must have a non negative value" + ) + return window + + @staticmethod + def compose_saqc_name(sta_thing_id: int, sta_stream_id: int) -> str: + return f"T{sta_thing_id}S{sta_stream_id}" + + @staticmethod + def parse_saqc_name(name: str) -> tuple[int, int] | tuple[None, None]: + # eg. 
T42S99 + name = name[1:] # rm 'T' + t, *ds = name.split("S", maxsplit=1) + return (int(t), int(ds[0])) if ds else (None, None) + + def fetch_qaqc_config(self, thing_uuid) -> ConfDB.QaqcT | None: + q = ( + "SELECT q.* FROM config_db.qaqc q " + "JOIN config_db.project p ON q.project_id = p.id " + "JOIN config_db.thing t ON p.id = t.project_id " + "WHERE t.uuid = %s ORDER BY q.id DESC " # we want the last config + ) + with self.conn.cursor(row_factory=dict_row) as cur: + return cur.execute(cast(Literal, q), [thing_uuid]).fetchone() + + def fetch_qaqc_tests(self, qaqc_id: int) -> list[ConfDB.QaqcTestT]: + q = "SELECT * FROM config_db.qaqc_test qt WHERE qt.qaqc_id = %s" + with self.conn.cursor(row_factory=dict_row) as cur: + return cur.execute(cast(Literal, q), [qaqc_id]).fetchall() + + def fetch_project(self, thing_uuid: str) -> ConfDB.ProjectT: + """Returns project UUID and project name for a given thing.""" + q = ( + "SELECT p.* FROM config_db.project p " + "JOIN config_db.thing t ON p.id = t.project_id " + "WHERE t.uuid = %s" + ) + with self.conn.cursor(row_factory=dict_row) as cur: + return cur.execute(cast(Literal, q), [thing_uuid]).fetchone() + + def fetch_schema(self, thing_uuid) -> str: + return self.conn.execute( + "SELECT schema FROM public.schema_thing_mapping WHERE thing_uuid = %s", + [thing_uuid], + ).fetchone()[0] + + def fetch_thing(self, thing_uuid: str): + q = sql.SQL( + 'select "id", "name", "uuid" from {schema}.thing where "uuid" = %s' + ).format(schema=sql.Identifier(self.schema)) + with self.conn.cursor(row_factory=dict_row) as cur: + return cur.execute(q, [thing_uuid]).fetchone() + + def fetch_thing_uuid_for_sta_stream(self, sta_stream_id: int): + q = ( + "select thing_id as thing_uuid from public.datastream_link where " + "device_property_id = %s" + ) + row = self.conn.execute(cast(Literal, q), [sta_stream_id]).fetchone() + return row and row[0] + + def fetch_datastream_by_pos(self, thing_uuid, position) -> dict[str, Any]: + query = ( + "SELECT ds.id, ds.name, ds.position, ds.thing_id " + "FROM {schema}.datastream ds " + "JOIN {schema}.thing t ON t.id = ds.thing_id " + "WHERE t.uuid = %s " + "AND ds.position = %s" + ) + # Note that 'position' is of type varchar in DB + with self.conn.cursor(row_factory=dict_row) as cur: + return cur.execute( + sql.SQL(cast(Literal, query)).format( + schema=sql.Identifier(self.schema) + ), + [thing_uuid, str(position)], + ).fetchone() + + def fetch_datastream_data_legacy( + self, + datastream_id: int, + start_date: TimestampT | None, + end_date: TimestampT | None, + window: pd.Timedelta | int | None, + ) -> list[DbRowT] | None: + + # (start_date - window) <= start_date <= data <= end_date + # [===context window===]+[===============data============] + if isinstance(window, pd.Timedelta) and start_date is not None: + start_date = start_date - window + if start_date is None: + start_date = "-Infinity" + if end_date is None: + end_date = "Infinity" + + query = sql.SQL( + cast( + Literal, + "select {fields} from {schema}.observation o " + "where o.datastream_id = %s " + "and o.result_time >= %s " + "and o.result_time <= %s " + "order by o.result_time desc " + "limit %s", + ) + ).format( + fields=sql.SQL(", ").join(map(sql.Identifier, _OBS_COLUMNS)), + schema=sql.Identifier(self.schema), + ) + + # Fetch data by dates including context window, iff it was defined + # as a timedelta. None as limit, becomes SQL:'LIMIT NULL' which is + # equivalent to 'LIMIT ALL'. 
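+ # Note (descriptive only): the four parameters below bind, in order, to the
+ # placeholders of the query above: datastream_id, lower time bound,
+ # upper time bound and limit.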
+ params = [datastream_id, start_date, end_date, None] + data = self.conn.execute(query, params).fetchall() + if not data: # If we have no data we also need no context data + return data + + # Fetch data from context window, iff it was passed as a number, + # which means number of observations before the actual data. + context = [] + if isinstance(window, int) and window > 0: + params = [datastream_id, "-Infinity", start_date, window] + context = self.conn.execute(query, params).fetchall() + # If the exact start_date is present in the data, the usage + # of `>=` and `<=` will result in a duplicate row. + if len(context) > 0 and context[0][0] == data[-1][0]: + context = context[1:] + + # In the query we ordered descending (newest first) to correctly + # use the limit keyword, but for python/pandas etc. we want to + # return an ascending (oldest first) data set. + return context[::-1] + data[::-1] + + def fetch_datastream_data_sta( + self, + sta_stream_id: int, + start_date: TimestampT | None, + end_date: TimestampT | None, + window: pd.Timedelta | int | None, + ) -> list[DbRowT] | None: + + # (start_date - window) <= start_date <= data <= end_date + # [===context window===]+[===============data============] + if isinstance(window, pd.Timedelta) and start_date is not None: + start_date = start_date - window + if start_date is None: + start_date = "-Infinity" + if end_date is None: + end_date = "Infinity" + + # Mind that o."DATASTREAM_ID" is the STA datastream id + query = sql.SQL( + cast( + Literal, + "select {fields}, " + "l.datastream_id as raw_datastream_id " + 'from {schema}."OBSERVATIONS" o ' + "join public.sms_datastream_link l " + 'on o."DATASTREAM_ID" = l.device_property_id ' + 'where o."DATASTREAM_ID" = %s ' + 'and o."RESULT_TIME" >= %s ' + 'and o."RESULT_TIME" <= %s ' + 'order by o."RESULT_TIME" desc ' + "limit %s", + ) + ).format( + fields=sql.SQL(", ").join( + map(sql.Identifier, map(str.upper, _OBS_COLUMNS)) + ), + schema=sql.Identifier(self.schema), + ) + + # Fetch data by dates including context window, iff it was defined + # as a timedelta. None as limit, becomes SQL:'LIMIT NULL' which is + # equivalent to 'LIMIT ALL'. + params = [sta_stream_id, start_date, end_date, None] + data = self.conn.execute(query, params).fetchall() + if not data: # If we have no data we also need no context data + return data + + # Fetch data from context window, iff it was passed as a number, + # which means number of observations before the actual data. + context = [] + if isinstance(window, int) and window > 0: + params = [sta_stream_id, "-Infinity", start_date, window] + context = self.conn.execute(query, params).fetchall() + # If the exact start_date is present in the data, the usage + # of `>=` and `<=` will result in a duplicate row. + if len(context) > 0 and context[0][0] == data[-1][0]: + context = context[1:] + + # In the query we ordered descending (newest first) to correctly + # use the limit keyword, but for python/pandas etc. we want to + # return an ascending (oldest first) data set. + return context[::-1] + data[::-1] + + def fetch_unflagged_daterange_legacy( + self, datastream_id + ) -> tuple[TimestampT, TimestampT] | tuple[None, None]: + """Returns (first aka. 
earliest, last) timestamp of unflagged data.""" + part = ( + "select o.result_time from {schema}.observation o " + "where o.datastream_id = %s and " + "(o.result_quality is null or o.result_quality = 'null'::jsonb)" + "order by result_time {order} limit 1" + ) + newest = sql.SQL(part).format( + schema=sql.Identifier(self.schema), order=sql.SQL("desc") + ) + oldest = sql.SQL(part).format( + schema=sql.Identifier(self.schema), order=sql.SQL("asc") + ) + query = sql.SQL("({}) UNION ALL ({}) ORDER BY result_time").format( + oldest, newest + ) + r = self.conn.execute(query, [datastream_id, datastream_id]).fetchall() + if not r: + return None, None + return r[0][0], r[1][0] + + def fetch_unflagged_daterange_sta( + self, sta_stream_id + ) -> tuple[TimestampT, TimestampT] | tuple[None, None]: + """Returns (first aka. earliest, last) timestamp of unflagged data.""" + + # Mind that o."DATASTREAM_ID" is the STA datastream id + part = """\ + select o."RESULT_TIME" from {schema}."OBSERVATIONS" o + where o."DATASTREAM_ID" = %s and + (o."RESULT_QUALITY" is null or o."RESULT_QUALITY" = 'null'::jsonb) + order by "RESULT_TIME" {order} limit 1 + """ + newest = sql.SQL(part).format( + schema=sql.Identifier(self.schema), order=sql.SQL("desc") + ) + oldest = sql.SQL(part).format( + schema=sql.Identifier(self.schema), order=sql.SQL("asc") + ) + query = sql.SQL('({}) UNION ALL ({}) ORDER BY "RESULT_TIME"').format( + oldest, newest + ) + r = self.conn.execute(query, [sta_stream_id, sta_stream_id]).fetchall() + if not r: + return None, None + return r[0][0], r[1][0] + + def qaqc_legacy(self) -> tuple[saqc.SaQC, dict[str, pd.DataFrame]]: + """ + Returns a tuple of data in saqc.SaQC and a metadata dict. + + For each raw data (in contrast to derived data products) the + metadata dict contains a pd.Dataframe (DF). + + The DF id indexed with the timestamps of the data. Note that the + context window is not part of this index. 
+ The columns of the DF are: + - 'thing_uuid': thing_uuid of the data point at this index + - 'datastream_id': datastream id of the data point at this index + """ + + def fetch_data(pos) -> tuple[pd.Series, pd.DataFrame]: + ds = self.fetch_datastream_by_pos(self.thing["uuid"], pos) + earlier, later = self.fetch_unflagged_daterange_legacy(ds["id"]) + obs = None + if earlier is not None: # else no unflagged data + obs = self.fetch_datastream_data_legacy( + ds["id"], earlier, later, self.window + ) + df = pd.DataFrame(obs or [], columns=_OBS_COLUMNS) + df = df.set_index("result_time", drop=True) + df.index = pd.DatetimeIndex(df.index) + data = self.extract_data_by_result_type(df) + # we truncate the context window + if data.empty: + data_index = data.index + else: + data_index = data.index[data.index.slice_indexer(earlier, later)] + meta = pd.DataFrame(index=data_index) + meta.attrs = { + "repr_name": f"Datastream {ds['name']} of Thing {self.thing['name']}" + } + meta["thing_uuid"] = self.thing["uuid"] + meta["datastream_id"] = ds["id"] + return data, meta + + qc = saqc.SaQC(scheme=KwargsScheme()) + md = dict() # metadata + for i, test in enumerate(self.tests): + test: ConfDB.QaqcTestT + origin = f"QA/QC Test #{i+1}: {self.conf['name']}/{test['name']}" + + # Raise for bad function, before fetching all the data + attr = test["function"] + try: + func = getattr(qc, attr) + except AttributeError: + raise UserInputError(f"{origin}: Unknown SaQC function {attr}") + + kwargs: dict = test.get("args", {}).copy() + if (position := test.get("position", None)) is not None: + if (name := str(position)) not in qc: + data, meta = fetch_data(position) + qc[name] = data + md[name] = meta + if "field" in kwargs: + logger.warning( + "argument 'field' is ignored and will be overwritten, " + "if legacy position is also given." 
+ ) + kwargs["field"] = name + + # run QC + try: + qc = func(**kwargs) + except Exception as e: + raise UserInputError( + f"{origin}: Execution of test failed, because {e}" + ) from e + + return qc, md + + def qaqc_sta(self) -> tuple[saqc.SaQC, dict[str, pd.DataFrame]]: + + def fetch_sta_data(thing_id: int, stream_id): + thing_uuid = self.fetch_thing_uuid_for_sta_stream(stream_id) + earlier, later = self.fetch_unflagged_daterange_sta(stream_id) + obs = None + if earlier is not None: # else no unflagged data + obs = self.fetch_datastream_data_sta( + stream_id, earlier, later, self.window + ) + df = pd.DataFrame(obs or [], columns=_OBS_COLUMNS + ["raw_datastream_id"]) + df = df.set_index("result_time", drop=True) + df.index = pd.DatetimeIndex(df.index) + data = self.extract_data_by_result_type(df) + # we truncate the context window + if data.empty: + data_index = data.index + else: + data_index = data.index[data.index.slice_indexer(earlier, later)] + meta = pd.DataFrame(index=data_index) + meta.attrs = {"repr_name": f"Datastream {stream_id} of Thing {thing_id}"} + meta["thing_uuid"] = thing_uuid + meta["datastream_id"] = df["raw_datastream_id"] + return data, meta + + qc = saqc.SaQC(scheme=KwargsScheme()) + md = dict() # metadata + for i, test in enumerate(self.tests): + test: ConfDB.QaqcTestT + origin = f"QA/QC Test #{i+1}: {self.conf['name']}/{test['name']}" + + # Raise for bad function, before fetching all the data + attr = test["function"] + try: + func = getattr(qc, attr) + except AttributeError: + raise UserInputError(f"{origin}: Unknown SaQC function {attr}") + + # fetch the relevant data + kwargs: dict = test.get("args", {}).copy() + for stream in test["streams"]: + stream: ConfDB.QaqcTestStreamT + arg_name = stream["arg_name"] + tid, sid = stream["sta_thing_id"], stream["sta_stream_id"] + alias = stream["alias"] + if tid is None or sid is None: + dict_update_to_list(kwargs, arg_name, alias) + continue + if alias not in qc: + data, meta = fetch_sta_data(tid, sid) + qc[alias] = data + md[alias] = meta + dict_update_to_list(kwargs, arg_name, alias) + + # run QC + try: + qc = func(**kwargs) + except Exception as e: + raise UserInputError( + f"{origin}: Execution of test failed, because {e}" + ) from e + + return qc, md + + def qacq_for_thing(self): + """ + Run QA/QC on data in the Observation-DB. + + Returns the number of observation that was updated and/or created. + """ + logger.info(f"Execute qaqc config {self.conf['name']!r}") + if not self.tests: + raise NoDataWarning( + f"No quality functions present in config {self.conf['name']!r}", + ) + + if self.legacy: + qc, meta = self.qaqc_legacy() + else: + qc, meta = self.qaqc_sta() + + # ============= legacy dataproducts ============= + # Data products must be created before quality labels are uploaded. + # If we first do the upload and an error occur we will not be able to + # recreate the same data for the dataproducts, this is because a second + # run ignores already flagged data. + n = 0 + dp_columns = [c for c in qc.columns if c not in meta.keys()] + if self.legacy and dp_columns: + n += self._create_dataproducts_legacy(qc[dp_columns]) + + m = self._upload(qc, meta) + return n + m + + def _upload(self, qc: saqc.SaQC, meta: dict[str, pd.DataFrame]): + total = 0 + flags = qc.flags # implicit flags translation -> KwargsScheme + for name in flags.columns: + + if name not in meta.keys(): + # Either the variable is just a temporary variable + # or we have a legacy dataproduct, which are handled + # by another function. 
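+ # (legacy dataproducts are uploaded separately by _create_dataproducts_legacy)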
+ continue + repr_name = meta[name].attrs["repr_name"] + flags_frame: pd.DataFrame = flags[name] + flags_frame["flag"] = flags_frame["flag"].fillna(saqc.UNFLAGGED) + + # remove context window + flags_frame = flags_frame.reindex(meta[name]["thing_uuid"].index) + if flags_frame.empty: + logger.debug(f"no new quality labels for {repr_name}") + continue + + flags_frame["thing_uuid"] = meta[name]["thing_uuid"] + flags_frame["datastream_id"] = meta[name]["datastream_id"] + + for uuid, group in flags_frame.groupby("thing_uuid", sort=False): + labels = self._create_quality_labels(group, self.conf["id"]) + + try: + self._upload_quality_labels(uuid, labels) + except requests.HTTPError as e: + if logger.isEnabledFor(logging.DEBUG): + detail = e.response.json().get("detail", None) + if isinstance(detail, list): + detail = "\n ".join(str(d) for d in detail) + logger.debug(f"Error Detail:\n {detail}") + raise RuntimeError( + f"uploading quality labels for variable {repr_name} failed" + ) from e + + logger.debug(f"uploaded {len(labels)} quality labels for {repr_name}") + total += len(labels) + + return total + + def _upload_quality_labels(self, thing_uuid, qlabels: list[JsonObjectT]): + r = requests.post( + f"{self.api_url}/observations/qaqc/{thing_uuid}", + json={"qaqc_labels": qlabels}, + headers={"Content-type": "application/json"}, + ) + r.raise_for_status() + + def _upload_dataproduct(self, thing_uuid, obs: list[JsonObjectT]): + r = requests.post( + f"{self.api_url}/observations/upsert/{thing_uuid}", + json={"observations": obs}, + headers={"Content-type": "application/json"}, + ) + r.raise_for_status() + + @staticmethod + def _create_quality_labels(flags_df: pd.DataFrame, config_id) -> list[JsonObjectT]: + + def compose_json(row: pd.Series) -> JsonObjectT: + # The series has the following index labels: + # 'flag': the quality flag/label + # 'func': the function name that applied the flag + # 'kwargs': the kwargs that was passed to the function + # 'datastream_id': the datastream ID of the data + # maybe more columns for internal use + return { + "result_time": row.name.isoformat(), # noqa, index label + "datastream_id": row["datastream_id"], + "result_quality": json.dumps( + { + "annotation": str(row["flag"]), + "annotationType": "SaQC", + "properties": { + "version": saqc.__version__, + "measure": row["func"], + "configuration": config_id, + "userLabel": row["kwargs"].get("label", None), + }, + } + ), + } + + return flags_df.apply(compose_json, axis=1).to_list() + + @staticmethod + def _create_dataproduct_legacy( + df: pd.DataFrame, name, config_id + ) -> list[JsonObjectT]: + + assert pd.Index(["data", "flag", "func", "kwargs"]).difference(df.columns).empty + + if pd.api.types.is_numeric_dtype(df["data"]): + result_type = ObservationResultType.Number + result_field = "result_number" + elif pd.api.types.is_string_dtype(df["data"]): + result_type = ObservationResultType.String + result_field = "result_string" + elif pd.api.types.is_bool_dtype(df["data"]): + result_type = ObservationResultType.Bool + result_field = "result_bool" + elif pd.api.types.is_object_dtype(df["data"]): + result_type = ObservationResultType.Json + result_field = "result_json" + else: + raise UserInputError(f"data of type {df['data'].dtype} is not supported") + + def compose_json(row: pd.Series) -> JsonObjectT: + val = row["data"] + if result_type == ObservationResultType.Json: + val = json.dumps(val) + return { + "result_time": row.name.isoformat(), # noqa, index label + "result_type": result_type, + result_field: val, + 
"result_quality": json.dumps( + { + "annotation": str(row["flag"]), + "annotationType": "SaQC", + "properties": { + "version": saqc.__version__, + "measure": row["func"], + "configuration": config_id, + "userLabel": row["kwargs"].get("label", None), + }, + } + ), + "datastream_pos": name, + } + + valid = df["data"].notna() + return df[valid].apply(compose_json, axis=1).dropna().to_list() + + def _create_dataproducts_legacy(self, qc): + total = 0 + flags, data = qc.flags, qc.data # implicit flags translation ->KwargsScheme + for name in flags.columns: + df: pd.DataFrame = flags[name] + df["data"] = data[name] + if df.empty: + logger.debug(f"no data for data product {name}") + continue + obs = self._create_dataproduct_legacy(df, name, self.conf["id"]) + self._upload_dataproduct(self.thing["uuid"], obs) + logger.info(f"uploaded {len(obs)} data points for dataproduct {name!r}") + total += len(obs) + continue + return total + + +def qacq( + thing_uuid: str, + dbapi_url: str, + dsn_or_pool: str | ConnectionPool, + **kwargs, +): + """ + Run QA/QC on data in the Observation-DB. + + First the QAQC configuration is fetched for a given Thing. + In the legacy workflow + + Parameters + ---------- + thing_uuid : str + The UUID of the thing that triggered the QA/QC. + + dbapi_url : str + Base URL of the DB-API. + + dsn_or_pool : str or psycopg_pool.ConnectionPool + If a pool is passed, a connection from the pool is used. + If a string is passed, a connection is created with the string as DSN. + + kwargs : + If ``dsn_or_pool`` is a ``psycopg_pool.ConnectionPool`` all kwargs + are passed to its ``connect`` method. + Otherwise, all kwargs are passed to ``psycopg.Connection.connect()``. + + Returns + ------- + n: int + Number of observation updated and/or created. + """ + ping_dbapi(dbapi_url) + + if isinstance(dsn_or_pool, str): + conn_setup = Connection.connect + kwargs = {"conninfo": dsn_or_pool, **kwargs} + else: + conn_setup = dsn_or_pool.connection + + with conn_setup(**kwargs) as conn: + logger.info("successfully connected to configdb") + qaqc = QualityControl(conn, dbapi_url, thing_uuid) + return qaqc.qacq_for_thing() diff --git a/src/timeio/run_qc.py b/src/timeio/run_qc.py new file mode 100644 index 00000000..1ccf8017 --- /dev/null +++ b/src/timeio/run_qc.py @@ -0,0 +1,81 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import annotations + +import json +import logging + +from paho.mqtt.client import MQTTMessage + +import databases +from base_handler import AbstractHandler +from qualcontrol import QualityControl +from utils import get_envvar, setup_logging +from utils.errors import DataNotFoundError, UserInputError, NoDataWarning +from utils.journaling import Journal + +logger = logging.getLogger("run-quality-control") +journal = Journal("QualityControl") + + +class QcHandler(AbstractHandler): + def __init__(self): + super().__init__( + topic=get_envvar("TOPIC"), + mqtt_broker=get_envvar("MQTT_BROKER"), + mqtt_user=get_envvar("MQTT_USER"), + mqtt_password=get_envvar("MQTT_PASSWORD"), + mqtt_client_id=get_envvar("MQTT_CLIENT_ID"), + mqtt_qos=get_envvar("MQTT_QOS", cast_to=int), + mqtt_clean_session=get_envvar("MQTT_CLEAN_SESSION", cast_to=bool), + ) + self.publish_topic = get_envvar("TOPIC_QC_DONE") + self.publish_qos = get_envvar("TOPIC_QC_DONE_QOS", cast_to=int) + self.db = databases.Database(get_envvar("DATABASE_DSN")) + self.dbapi = databases.DBapi(get_envvar("DB_API_BASE_URL")) + + def act(self, content: dict, message: MQTTMessage): + if (thing_uuid := content.get("thing_uuid")) is None: + raise DataNotFoundError( + "mandatory field 'thing_uuid' is not present in data" + ) + logger.info(f"Thing {thing_uuid} triggered QAQC service") + + self.dbapi.ping_dbapi() + with self.db.connection() as conn: + logger.info("successfully connected to configdb") + try: + qaqc = QualityControl(conn, self.dbapi.base_url, thing_uuid) + except NoDataWarning as w: + # TODO: uncomment if QC is production-ready + # journal.warning(str(w), thing_uuid) + raise w + try: + some = qaqc.qacq_for_thing() + except UserInputError as e: + journal.error(str(e), thing_uuid) + raise e + except NoDataWarning as w: + journal.warning(str(w), thing_uuid) + raise w + + if some: + journal.info(f"QC done. Config: {qaqc.conf['name']}", thing_uuid) + else: + journal.warning( + f"QC done, but no quality labels were generated. 
" + f"Config: {qaqc.conf['name']}", + thing_uuid, + ) + return + + logger.debug(f"inform downstream services about success of qc.") + payload = json.dumps({"thing": thing_uuid}) + self.mqtt_client.publish( + topic=self.publish_topic, payload=payload, qos=self.publish_qos + ) + + +if __name__ == "__main__": + setup_logging(get_envvar("LOG_LEVEL", "INFO")) + QcHandler().run_loop() diff --git a/src/timeio/thing.py b/src/timeio/thing.py new file mode 100644 index 00000000..7e56f924 --- /dev/null +++ b/src/timeio/thing.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import frost + + +class Database: + def __init__( + self, + username: str, + password: str, + url: str, + ro_username: str, + ro_password: str, + ro_url: str, + ): + self.username = username + self.password = password + self.url = url + self.ro_username = ro_username + self.ro_password = ro_password + self.ro_url = ro_url + + @classmethod + def get_instance(cls, message: dict) -> Database: + try: + return cls( + message["username"], + message["password"], + message["url"], + message["ro_username"], + message["ro_password"], + message["ro_url"], + ) + except KeyError as e: + raise ValueError( + f'Unable to get Database instance from message "{message}"' + ) from e + + +class Project: + def __init__(self, uuid: str, name: str) -> None: + self.uuid = uuid + self.name = name + + @classmethod + def get_instance(cls, message: dict) -> Project: + try: + return cls(message["uuid"], message["name"]) + except KeyError as e: + raise ValueError( + f'Unable to get Project instance from message "{message}"' + ) from e + + +class RawDataStorage: + def __init__( + self, + username: str, + password: str, + bucket_name: str, + filename_pattern: str | None = None, + ): + self.username = username + self.password = password + self.bucket_name = bucket_name + self.filename_pattern = filename_pattern + + @classmethod + def get_instance(cls, message: dict) -> RawDataStorage: + try: + return cls( + message["username"], + message["password"], + message["bucket_name"], + message.get("filename_pattern", None), + ) + except KeyError as e: + raise ValueError( + f'Unable to get RawDataStorage instance from message "{message}"' + ) from e + + +class ExternalSFTP: + def __init__( + self, + enabled: bool, + uri: str, + path: str, + username: str, + password: str, + sync_interval: int, + public_key: str, + private_key_path: str, + ): + self.enabled = enabled + self.uri = uri + self.path = path + self.username = username + self.password = password + self.sync_interval = sync_interval + self.public_key = public_key + self.private_key_path = private_key_path + + @classmethod + def get_instance(cls, message: dict) -> ExternalSFTP: + try: + return cls( + message["enabled"], + message["uri"], + message["path"], + message["username"], + message["password"], + message["sync_interval"], + message["public_key"], + message["private_key_path"], + ) + except KeyError as e: + raise ValueError( + f'Unable to get ExternalSFTP instance from message "{message}"' + ) from e + + +class ExternalApi: + def __init__( + self, enabled: bool, api_type: str, sync_interval: int, settings: dict + ): + self.enabled = enabled + self.api_type = api_type + self.sync_interval = sync_interval + self.settings = settings + + @classmethod + def get_instance(cls, message: dict) -> ExternalApi: + try: + return cls( + message["enabled"], + message["type"], + message["sync_interval"], + message["settings"], + ) + except KeyError as e: + raise ValueError( + f'Unable to get ExternalAPI instance 
from message "{message}"' + ) from e + + +class Thing: + def __init__( + self, + uuid: str, + name: str | None = None, + project: Project | None = None, + database: Database | None = None, + raw_data_storage: RawDataStorage | None = None, + external_sftp: ExternalSFTP | None = None, + external_api: ExternalApi | None = None, + description: str | None = None, + properties: dict | None = None, + ): + self.uuid = uuid + self.name = name + self.project = project + self.database = database + self.raw_data_storage = raw_data_storage + self.external_sftp = external_sftp + self.external_api = external_api + self.description = description + self.properties = properties + + @classmethod + def get_instance(cls, message: dict) -> Thing: + try: + # external sftp and/or external api is optional + external_sftp = None + if "ext_sftp_settings" in message: + external_sftp = ExternalSFTP.get_instance(message["ext_sftp_settings"]) + external_api = None + if "ext_api_settings" in message: + external_api = ExternalApi.get_instance(message["ext_api_settings"]) + + return cls( + message["uuid"], + message["name"], + Project.get_instance(message["project"]), + Database.get_instance(message["database"]), + RawDataStorage.get_instance(message["raw_data_storage"]), + external_sftp, + external_api, + message["description"], + message["properties"], + ) + except KeyError as e: + raise ValueError( + f'Unable to get Thing instance from message "{message}"' + ) from e + + def setup_frost(self, tomcat_proxy_url: str): + frost.write_context_file( + schema=self.database.username.lower(), + user=f"sta_{self.database.ro_username.lower()}", + password=self.database.ro_password, + db_url=self.database.url, + tomcat_proxy_url=tomcat_proxy_url, + ) diff --git a/src/timeio/utils/__init__.py b/src/timeio/utils/__init__.py new file mode 100644 index 00000000..6d15ada9 --- /dev/null +++ b/src/timeio/utils/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 + +from .common import get_envvar, get_envvar_as_bool, setup_logging diff --git a/src/timeio/utils/cast.py b/src/timeio/utils/cast.py new file mode 100644 index 00000000..75159be4 --- /dev/null +++ b/src/timeio/utils/cast.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +from typing import Any, Iterable + + +def flatten_nested_str_list(obj: Any) -> list[str] | None: + """ + Flatten an arbitrary nested list of strings to a simple flat list. + + If any element is not a list and nor a string the function + return None. + + See also function `flatten()`. + """ + return flatten(obj, lambda e: isinstance(e, str), (list,)) + + +def flatten( + obj: Any, + element_test: callable = lambda x: True, + list_types: Iterable[type] = (list, tuple), +) -> list | None: + """ + Flatten an arbitrary nested list to a simple flat list. + If any non-list element fails the element_test the + function returns None. 
+ + Example: + - flatten_nested_list('99') -> None + - flatten_nested_list(['99', [['la'], [666]]]) -> ['99', 'la', 666] + - flatten_nested_list(['99', [['la'], [666]]], lambda e: isinstance(e,str)) -> None + - flatten_nested_list(['99', [['la']]], lambda e: isinstance(e,str)) -> ['99', 'la'] + """ + flat = [] + if not isinstance(obj, tuple(list_types)): + return None + for elem in obj: + if isinstance(elem, tuple(list_types)): + if (li := flatten(elem, element_test, list_types)) is None: + return None + flat += li + elif element_test(elem): + flat.append(elem) + else: + return None + return flat diff --git a/src/timeio/utils/common.py b/src/timeio/utils/common.py new file mode 100644 index 00000000..3fd40696 --- /dev/null +++ b/src/timeio/utils/common.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import enum +import logging +import logging.config +import os +from typing import Any + +no_default = type("no_default", (), {}) + + +class ObservationResultType(enum.IntEnum): + Number = 0 + String = 1 + Json = 2 + Bool = 3 + + +def get_envvar(name, default: Any = no_default, cast_to: type = None, cast_None=True): + val = os.environ.get(name) + if val is None: + if default is no_default: + raise EnvironmentError(f"Missing environment variable {name!r}.") + return default + elif val == "None" and cast_None: + return None + elif cast_to is not None: + try: + if cast_to is bool: + return get_envvar_as_bool(name) + return cast_to(val) + except Exception: + raise TypeError( + f"Could not cast environment variable {name!r} " + f"to {cast_to}. Value: {val}" + ) from None + return val + + +def get_envvar_as_bool( + name, false_list=("no", "false", "0", "null", "none"), empty_is_False: bool = False +) -> bool: + """ + Return True if an environment variable is set and its value + is not in the false_list. + Return False if an environment variable is unset or if its value + is in the false_list. + + If 'empty_is_False' is True: + Same logic as above, but an empty string is considered False + + The false_list not case-sensitive. (faLsE == FALSE = false) + """ + val = os.environ.get(name, None) + if val is None: + return False + if val == "": + return not empty_is_False + return val.lower() not in false_list + + +def log_query(logger: logging.Logger, query: str, params: Any = no_default): + """ + Log a string(!) query. + + Note that this has no dependencies to any database package at all. + For a more detailed logging see: + - utils.psycopg_helper.log_psycopg_query + - utils.psycopg_helper.monkey_patch_psycopg_execute_to_log_sql_queries + """ + if not isinstance(query, str): + raise TypeError(f"query must be string not {type(query)}") + + args = [query] + if params is no_default: + args.append("--") + else: + args.append(params) + logger.debug(f"\n\tQUERY: %r\n\tPARAMS: %s", *args) + + +def setup_logging(log_level="INFO"): + """ + Setup logging. + + Globally setup logging according to utils.common.LOGGING_CONFIG + and set the log level of the root logger to the given level. 
+ """ + + format = ( + "[%(asctime)s] %(process)s %(levelname)-6s %(name)s: %(funcName)s: %(message)s" + ) + logging.basicConfig( + level=log_level, + format=format, + datefmt="%Y-%m-%d %H:%M:%S", + ) diff --git a/src/timeio/utils/crypto.py b/src/timeio/utils/crypto.py new file mode 100644 index 00000000..66e495b3 --- /dev/null +++ b/src/timeio/utils/crypto.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +import os + +from cryptography.fernet import Fernet +from .common import get_envvar as _get_envvar + + +def get_crypt_key() -> str: + return _get_envvar("FERNET_ENCRYPTION_SECRET") + + +def encrypt(data: str, key: str) -> str: + return Fernet(key).encrypt(data.encode()).decode() + + +def decrypt(token: str, key: str): + return Fernet(key).decrypt(token).decode() diff --git a/src/timeio/utils/errors.py b/src/timeio/utils/errors.py new file mode 100644 index 00000000..82d4d996 --- /dev/null +++ b/src/timeio/utils/errors.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +from __future__ import annotations + + +class ParsingError(RuntimeError): + """Parsing failed.""" + + pass + + +class ProcessingError(RuntimeError): + """ + Processing failed due to + - bad system state (e.g. a service is not reachable, etc.) + - faulty implementation + """ + + pass + + +class ParsingWarning(RuntimeWarning): + """ + Report parsing issues not severe enough to abort the process. + """ + + pass + + +# ===================================================================== +# Errors that are handled gracefully in base_handler.AbstractHandler +# ===================================================================== + + +class DataNotFoundError(RuntimeError): + """ + Data is missing. + Handled gracefully in base_handler.AbstractHandler + """ + + pass + + +class NoDataWarning(RuntimeWarning): + """ + Data is not present. + Handled gracefully in base_handler.AbstractHandler + + Added in version 0.4.0 + """ + + pass + + +class UserInputError(ParsingError): + """ + Error that originated by malformed data or input provided by a user. + Handled gracefully in base_handler.AbstractHandler + + Added in version 0.6.0 + """ + + pass diff --git a/src/timeio/utils/journaling.py b/src/timeio/utils/journaling.py new file mode 100644 index 00000000..02666e8f --- /dev/null +++ b/src/timeio/utils/journaling.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +import warnings +from datetime import datetime, timezone +from http.client import HTTPResponse +from typing import Literal +from urllib import request +from urllib.error import HTTPError + +from .common import get_envvar, get_envvar_as_bool + +__all__ = ["Journal"] +logger = logging.getLogger("journaling") + + +class Journal: + def __init__( + self, + name: str, + errors: Literal["raise", "warn", "ignore"] = "raise", + ): + """ + Class to send messages to the user journal. + + The journal offers three modes of operation, which can be chosen by the + argument `errors`. + + For a systems where journaling is curial, use `errors='raise'` (default) + and every error is raised immediately. For example if the journal-endpoint + (database-API) is not reachable or if a thing-UUID does not exist an error + will be raised. It is the duty of the caller to handle the error accordingly. + + For a systems where journaling is less important, one can use `errors='warn'`, + which will cause the Journal to emit a warning in case of errors, but will + not disturb the program flow in other ways. 
+ + If journaling is even less important one could use `errors='ignore'`, + which will cause the Journal to be silent in case of errors. + + Added in version 0.4.0 + """ + + if errors not in ["raise", "warn", "ignore"]: + raise ValueError('error must be one of ["raise", "warn", "ignore"]') + self.error_strategy = errors + self.name = name + self.enabled = get_envvar_as_bool("JOURNALING") + self.base_url = get_envvar("DB_API_BASE_URL", None) + + if not self.enabled: + warnings.warn( + "Journaling is disabled. To enable it, " + "set environment variables 'JOURNALING' " + "and 'DB_API_BASE_URL'", + RuntimeWarning, + stacklevel=2, + ) + return + + if self.base_url is None: + raise EnvironmentError( + "If JOURNALING is enabled, environment " + "variable DB_API_BASE_URL must be set." + ) + # check if up + with request.urlopen(f"{self.base_url}/health") as resp: + if resp.status != 200: + raise ConnectionError( + f"Failed to ping DB API '{self.base_url}/health'. " + f"HTTP status code: {resp.status}" + ) + + def info(self, message, thing_uuid): + self._to_journal("INFO", message, thing_uuid) + + def warning(self, message, thing_uuid): + self._to_journal("WARNING", message, thing_uuid) + + def error(self, message, thing_uuid): + self._to_journal("ERROR", message, thing_uuid) + + def _to_journal(self, level: str, message: str, thing_uuid): + if not self.enabled: + return + data = { + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "message": message, + "level": level, + "origin": self.name, + } + logger.info("Message to journal:\n>> %s[%s]: %s", self.name, level, message) + + req = request.Request( + url=f"{self.base_url}/journal/{thing_uuid}", + data=json.dumps(data).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", + ) + logger.debug(f"%s %s, data: %s", req.method, req.full_url, req.data) + + try: + resp: HTTPResponse = request.urlopen(req) + logger.debug("==> %s, %s", resp.status, resp.reason) + + except Exception as e: + if isinstance(e, HTTPError): + # HttpError is also an HTTPResponse object + logger.debug("==> %s, %s, %s", e.status, e.reason, e.read().decode()) + if self.error_strategy == "raise": + raise RuntimeError("Storing message to journal failed") from e + if self.error_strategy == "warn": + warnings.warn( + f"Storing message to journal failed, " + f"because of {type(e).__name__}: {e}", + RuntimeWarning, + stacklevel=3, + ) diff --git a/src/timeio/utils/psycopg_helper.py b/src/timeio/utils/psycopg_helper.py new file mode 100644 index 00000000..fcb5e5cb --- /dev/null +++ b/src/timeio/utils/psycopg_helper.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import logging as _logging +from functools import wraps as _wraps +from typing import Literal as _Literal +from typing import Union as _Union +from typing import cast as _cast +from warnings import warn as _warn + +try: + import psycopg as _psycopg + from psycopg import sql as _sql +except ImportError as e: + raise EnvironmentError( + f"To use the module {__package__}.{__name__} psycopg 3 is needed." + f"Install it with 'pip install psycopg[binary]'" + ) + +# Keep the original function, in case the monkey-patch +# is executed multiple times. 
+__psycopgCursorExecute = _psycopg.Cursor.execute +__patched = False +_ConOrCurT = _Union[_psycopg.Connection, _psycopg.Cursor] + + +def log_psycopg_query( + logger: _logging.Logger, + conn_or_cur: _ConOrCurT, + query, + params=None, + log_level=_logging.DEBUG, +): + msg, q = None, query + if isinstance(q, str): + q = _sql.SQL(_cast(_Literal, q.strip())) + if isinstance(q, _sql.Composable): + msg = f"\n\tQUERY: %r\n\tPARAMS: %s" + log_args = [q.as_string(conn_or_cur)] + if params is None: + log_args.append("--") + else: + log_args.append(params) + logger.log(log_level, msg, *log_args) + + +def monkey_patch_psycopg_execute_to_log_sql_queries( + logger: _logging.Logger, log_level: int | str = "DEBUG" +): + if isinstance(log_level, str): + log_level = _logging.getLevelName(log_level) + if not isinstance(log_level, int): + raise ValueError( + f"log_level must be an integer, or one of the default string levels, " + f"not {log_level}" + ) + + global __patched + if __patched: + _warn( + "monkey_patch_psycopg_execute_to_log_sql_queries() " + "should only called once, to monkey-patch 'psycopg.Cursor.execute'. ", + stacklevel=2, + ) + else: + __patched = True + + def patch(func: callable) -> callable: + @_wraps(func) + def wrapper(self: _ConOrCurT, query, params=None, **kwargs): + # guard prevents unnecessary processing + if logger.isEnabledFor(log_level): + log_psycopg_query(logger, self, query, params, log_level=log_level) + return func(self, query, params, **kwargs) + + return wrapper + + _psycopg.Cursor.execute = patch(__psycopgCursorExecute) diff --git a/src/timeio/utils/types.py b/src/timeio/utils/types.py new file mode 100644 index 00000000..2ba84af4 --- /dev/null +++ b/src/timeio/utils/types.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import datetime +import typing as _t + +JsonScalarT = _t.Union[str, int, float, bool, None] +JsonArrayT = list["JsonT"] +JsonObjectT = dict[str, "JsonT"] +JsonT = _t.Union[JsonScalarT, JsonArrayT, JsonObjectT] + +DbScalarT = _t.Union[str, bool, int, float, JsonT, datetime.datetime.timestamp] +DbRowT = tuple[DbScalarT, ...] 
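+# Illustrative examples (assumptions, not enforced by these aliases):
+#   JsonT:  {"annotation": "0.0", "properties": {"measure": "flagRange"}}
+#   DbRowT: ("2023-01-16T11:00:02+00:00", 0, 3.74, None)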
+ + +class MqttPayload: + + class QaqcConfigV1_T(_t.TypedDict): + version: _t.Literal[1] + name: str + project_uuid: str + context_window: str + tests: list[MqttPayload.QaqcTestT] + + class QaqcConfigV2_T(_t.TypedDict): + version: _t.Literal[2] + name: str + project_uuid: str + context_window: str + functions: list[MqttPayload.QaqcFunctionT] + + class QaqcTestT(_t.TypedDict, total=True): + function: str + kwargs: dict[str, _t.Any] + position: int + + class QaqcFunctionT(_t.TypedDict, total=True): + name: str + func_id: str + kwargs: dict[str, _t.Any] + datastreams: list[MqttPayload.QaqcFuncStreamsT] + + class QaqcFuncStreamsT(_t.TypedDict): + arg_name: str + thing_sta_id: int | None + sta_stream_id: int | None + + +class ConfDB: + + class DatabaseT(_t.TypedDict): + id: int + schema: str + user: str + password: str + ro_user: str + re_password: str + + class ExtApiT(_t.TypedDict): + id: int + api_type_id: int + sync_interval: int + sync_enabled: bool + settings: JsonT | None + + class ExtApiTypeT(_t.TypedDict): + id: int + name: str + + class ExtSFTP_T(_t.TypedDict): + id: int + uri: str + path: str + user: str + password: str | None + ssh_priv_key: int + ssh_pub_key: int + sync_interval: int + sync_enabled: bool + + class FileParserT(_t.TypedDict): + id: int + file_parser_type_id: int + name: str + params: JsonT | None + + class FileParserTypeT(_t.TypedDict): + id: int + name: str + + class IngestTypeT(_t.TypedDict): + id: int + name: str + + class MqttT(_t.TypedDict): + id: int + user: str + password: str + password_hashed: str + topic: str | None + mqtt_device_type_id: int | None + + class MqttDeviceTypeT(_t.TypedDict): + id: int + name: str + + class ProjectT(_t.TypedDict): + id: int + name: str + uuid: str + database_id: int + + class QaqcT(_t.TypedDict): + id: int + name: str + project_id: int + context_window: str + + class QaqcTestT(_t.TypedDict): + id: int + qaqc_id: int + function: str + args: JsonT | None + position: int | None + name: str | None + streams: list[ConfDB.QaqcTestStreamT] | None + + class QaqcTestStreamT(_t.TypedDict): + arg_name: str + sta_thing_id: int | None + sta_stream_id: int | None + + class S3_StoreT(_t.TypedDict): + id: int + user: str + password: str + bucket: str + filename_pattern: str | None + file_parser_id: int + + class ThingT(_t.TypedDict): + id: int + uuid: int + name: str + project_id: int + ingest_type_id: int + s3_store_id: int + mqtt_id: int + ext_sftp_id: int | None + ext_api_id: int | None diff --git a/src/timeio/version.py b/src/timeio/version.py new file mode 100644 index 00000000..906d362f --- /dev/null +++ b/src/timeio/version.py @@ -0,0 +1 @@ +__version__ = "0.6.0" -- GitLab