Source code for pavics_datacatalog.wps_processes.wps_pavicrawler

import os
import time
import traceback
import json
from pywps import Process, get_format, configuration
from pywps import LiteralInput, ComplexOutput

from pavics import catalog

# Example usage:
# localhost/pywps?service=WPS&request=execute&version=1.0.0&\
# identifier=pavicrawler&storeExecuteResponse=true&status=true&DataInputs=
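# A hedged sketch of issuing the same request from Python (assumes the
# third-party `requests` library and a PyWPS endpoint at localhost/pywps;
# the target_files value is a made-up example):
#
#   import requests
#   params = {'service': 'WPS', 'request': 'execute', 'version': '1.0.0',
#             'identifier': 'pavicrawler', 'storeExecuteResponse': 'true',
#             'status': 'true', 'DataInputs': 'target_files=example.nc'}
#   r = requests.get('http://localhost/pywps', params=params)
#   print(r.text)  # XML status document pointing to the stored response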

# Current behaviour: values in the NetCDF files take precedence over the
# values in the Solr database. This could be exposed as an input option
# (a sketch follows)...
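# A hedged sketch of what such an input could look like (hypothetical
# overwrite_solr parameter, not part of the current process):
#
#   LiteralInput('overwrite_solr', 'NetCDF values take precedence',
#                abstract=('If true, metadata found in the NetCDF files '
#                          'overwrite values already stored in Solr.'),
#                data_type='boolean', min_occurs=0, default=True)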

# The list of metadata facets to scan should come from a config file;
# hard-code it here for now (see the sketch below for a config-based
# alternative):
my_facets = ['experiment', 'frequency', 'institute', 'model', 'project']
# variable, variable_long_name and cf_standard_name are not necessarily
# in the global attributes; need to come back to this later...
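
# A minimal sketch of loading the facet list from the PyWPS config instead
# (the [pavics] section and facets key are assumptions, not existing
# settings):
#
#   facets_str = configuration.get_config_value('pavics', 'facets')
#   my_facets = [f.strip() for f in facets_str.split(',')]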

# The user under which Apache is running must be able to write to this
# directory.
output_path = configuration.get_config_value('server', 'outputpath')
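
# A hedged sanity check that could be added here (assumption: failing at
# import time is acceptable in this deployment):
#
#   if not os.access(output_path, os.W_OK):
#       raise RuntimeError(
#           "output path '{0}' is not writable".format(output_path))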

json_format = get_format('JSON')
gmlxml_format = get_format('GML')
text_format = get_format('TEXT')


class PavicsCrawler(Process):
    def __init__(self):
        # Environment-driven configuration for the crawler targets.
        self.solr_server = os.environ.get('SOLR_HOST', None)
        env_thredds_host = os.environ.get('THREDDS_HOST', '')
        self.wms_alternate_server = os.environ.get(
            'WMS_ALTERNATE_SERVER', None)
        # THREDDS_HOST may hold a comma-separated list of servers.
        self.thredds_servers = [
            s.strip() for s in env_thredds_host.split(',')]

        inputs = [LiteralInput('target_files',
                               'Files to crawl',
                               abstract=('Only those file names will be '
                                         'crawled.'),
                               data_type='string',
                               min_occurs=0,
                               max_occurs=10000),
                  LiteralInput('target_thredds',
                               'Thredds server to scan',
                               abstract='Thredds server to scan.',
                               data_type='string',
                               min_occurs=0)]

        outputs = [ComplexOutput('crawler_result',
                                 'PAVICS Crawler Result',
                                 abstract='Crawler result as a json.',
                                 supported_formats=[json_format],
                                 as_reference=True)]

        super(PavicsCrawler, self).__init__(
            self._handler,
            identifier='pavicrawler',
            title='PAVICS Crawler',
            abstract=('Crawl thredds server and write metadata to SOLR '
                      'database.'),
            version='0.1',
            inputs=inputs,
            outputs=outputs,
            store_supported=True,
            status_supported=True)

    def _handler(self, request, response):
        if 'target_files' in request.inputs:
            target_files = [x.data for x in request.inputs['target_files']]
        else:
            target_files = None

        # If a target thredds server is specified, it must be in the list
        # of thredds servers from the config, otherwise we fall back to
        # scanning all thredds servers.
        # Suggestion: decompose the target_thredds and compare individual
        # sections of the url/port to allow more flexibility in the
        # comparison.
        if ('target_thredds' in request.inputs) and \
           (request.inputs['target_thredds'][0].data in
                self.thredds_servers):
            target_thredds_servers = [
                request.inputs['target_thredds'][0].data]
        else:
            target_thredds_servers = self.thredds_servers

        try:
            for thredds_server in target_thredds_servers:
                # The WMS alternate server may contain a <HOST> placeholder
                # that is substituted with the current thredds hostname.
                if (self.wms_alternate_server is not None) and \
                   ('<HOST>' in self.wms_alternate_server):
                    wms_with_host = self.wms_alternate_server.replace(
                        '<HOST>',
                        thredds_server.split('/')[2].split(':')[0])
                else:
                    wms_with_host = self.wms_alternate_server
                update_result = catalog.pavicrawler(
                    thredds_server, self.solr_server, my_facets,
                    set_dataset_id=True,
                    wms_alternate_server=wms_with_host,
                    target_files=target_files)
        except Exception:
            raise Exception(traceback.format_exc())

        # Here we construct a unique filename from the current UTC time.
        # Note: update_result holds the result of the last server scanned.
        time_str = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        output_file_name = "solr_result_{0}_.json".format(time_str)
        output_file = os.path.join(output_path, output_file_name)
        with open(output_file, 'w') as f1:
            f1.write(json.dumps(update_result))
        response.outputs['crawler_result'].file = output_file
        response.outputs['crawler_result'].output_format = json_format
        return response
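
# A hedged sketch of wiring this process into a PyWPS service (assumes
# pywps>=4; the WSGI module layout and config file name are assumptions):
#
#   from pywps import Service
#   application = Service(processes=[PavicsCrawler()],
#                         cfgfiles=['pywps.cfg'])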