Skip to content
Snippets Groups Projects
Commit 5bea4f9e authored by Jens Bröder
Browse files

Read last run from config and pass it through to harvester methods

parent 541f54e0
No related branches found
No related tags found
No related merge requests found
......@@ -6,8 +6,12 @@ metadata:
name: "Helmholtz Knowledge Graph config"
description: "This file defines from which sources the harvester harvest and "
AllHarvesters:
last_run: 2022-12-18
# Pipeline source and configs
GitHarvester:
last_run: 2022-12-18
sources:
AWI:
- name: git_awi
......
......@@ -41,6 +41,11 @@ class Harvester():
# This is the harvester specific part in the config
self.config = full_config.get(self.__class__.__name__, {})
self.sources = self.config.get('sources', [])
last_run = self.config.get('last_run', None)
if last_run is None:
all_harvester_conf = full_config.get('AllHarvesters', {})
last_run = all_harvester_conf.get('last_run', None)
self.last_run = last_run
def get_sources(self):
"""Return sources"""
......
......@@ -214,16 +214,19 @@ class DataciteHarvester(Harvester):
fail = []
if base_savepath is None:
base_savepath = self.outpath
if since is None:
since = self.last_run
roars = self.get_roars()
if centers == 'all':
for key, val in roars.items():
print(f'Harvesting Center {key}')
roar = val['roar']
base_savepath1 = base_savepath / key
suc, fai = harvest_roar(roar, base_savepath=base_savepath1)
suc, fai = harvest_roar(roar, base_savepath=base_savepath1, since=since)
fail.extend(fai)
else:
roar = roars['center']['roar']
base_savepath = base_savepath / roars['center']
suc, fail = harvest_roar(roar, base_savepath=base_savepath)
suc, fail = harvest_roar(roar, base_savepath=base_savepath, since=since)
print(fail)
......@@ -261,7 +261,7 @@ def harvest_hgf_gitlabs(name: str,
def request_gitlab_projects(
name: str, base_savepath: Optional[Path] = Path('.'), gitlabs_meta=load_gitlabs_meta()) -> None:
name: str, base_savepath: Optional[Path] = Path('.'), gitlabs_meta=load_gitlabs_meta(), since=None) -> None:
"""Get all projects from API for a given HGF center"""
if base_savepath is None:
......@@ -270,6 +270,7 @@ def request_gitlab_projects(
base_link = entry['url']
git_name = entry['name']
base_savepath_p = base_savepath / f'gitlab_project_jsons/{git_name}'
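# FIXME: `since` is accepted but not used yet; build it into the GitLab API request below so that only projects updated since that date are fetched (possibly via the `last_activity_after` query parameter; TODO confirm)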
link = f'{base_link}/api/v4/projects?pagination=keyset&non_archived=true&page=1&sort=desc&order_by=id&visibility_level=20&per_page=100'
os.makedirs(base_savepath_p, exist_ok=True)
......@@ -314,11 +315,14 @@ class GitHarvester(Harvester):
if base_savepath is None:
base_savepath = self.outpath
if since is None:
since = self.last_run
gitlabs_r = self.get_gitlabs()
if hgf_name == 'all':
threads: List[threading.Thread] = []
for center, val in gitlabs_r.items():
request_gitlab_projects(center, base_savepath=base_savepath, gitlabs_meta=gitlabs_r)
request_gitlab_projects(center, base_savepath=base_savepath, gitlabs_meta=gitlabs_r, since=since)
if use_threading:
# for now one thread per center
thread = threading.Thread(target=harvest_hgf_gitlabs,
......@@ -336,7 +340,7 @@ class GitHarvester(Harvester):
for thread in threads:
thread.join()
else:
request_gitlab_projects(center, base_savepath=base_savepath, gitlabs_meta=gitlabs_r)
request_gitlab_projects(center, base_savepath=base_savepath, gitlabs_meta=gitlabs_r, since=since)
harvest_hgf_gitlabs(hgf_name, base_savepath=base_savepath, gitlabs_meta=gitlabs_r)
......
......@@ -223,6 +223,9 @@ class SitemapHarvester(Harvester):
if base_savepath is None:
base_savepath = self.outpath
if since is None:
since = self.last_run
sitemaps_r = self.get_sitemaps()
if sitemap == 'all':
for sitem in sitemaps_r:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment