import os
import time
import traceback
import json
from pywps import Process, get_format, configuration
from pywps import LiteralInput, ComplexOutput
from pavics import catalog
# Example usage:
# localhost/pywps?service=WPS&request=execute&version=1.0.0&\
# identifier=pavicrawler&storeExecuteResponse=true&status=true&DataInputs=
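# A concrete example with an input (hostname and port are placeholders):
# localhost/pywps?service=WPS&request=execute&version=1.0.0&\
# identifier=pavicrawler&storeExecuteResponse=true&status=true&\
# DataInputs=target_thredds=http://localhost:8083/thredds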
# Current behaviour: values in the NetCDF files take precedence over the
# values in the Solr database. This could be an option as an input...
# The list of metadata to scan should come from a config file; it is
# set manually here for now:
my_facets = ['experiment', 'frequency', 'institute', 'model', 'project']
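# (Hypothetical sketch, assuming a 'pavics' section with a 'facets' entry
# were added to the pywps config file:
# my_facets = configuration.get_config_value('pavics', 'facets').split(','))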
# variable, variable_long_name and cf_standard_name are not necessarily
# in the global attributes; this needs to be revisited later...
# The user under which apache is running must be able to write to that
# directory.
output_path = configuration.get_config_value('server', 'outputpath')
json_format = get_format('JSON')
gmlxml_format = get_format('GML')
text_format = get_format('TEXT')
class PavicsCrawler(Process):
def __init__(self):
self.solr_server = os.environ.get('SOLR_HOST', None)
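        # THREDDS_HOST may list several servers separated by commas, e.g.
        # 'http://host1:8083/thredds,http://host2:8083/thredds'
        # (illustrative hostnames).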
env_thredds_host = os.environ.get('THREDDS_HOST', '')
self.wms_alternate_server = os.environ.get(
'WMS_ALTERNATE_SERVER', None)
        # Build a list (rather than a lazy map object) so the servers can
        # be iterated and searched more than once.
        self.thredds_servers = [s.strip()
                                for s in env_thredds_host.split(',')]
inputs = [LiteralInput('target_files',
'Files to crawl',
                               abstract=('Only these file names will be '
                                         'crawled.'),
data_type='string',
min_occurs=0,
max_occurs=10000),
LiteralInput('target_thredds',
'Thredds server to scan',
abstract='Thredds server to scan.',
data_type='string',
min_occurs=0)]
outputs = [ComplexOutput('crawler_result',
'PAVICS Crawler Result',
                                 abstract='Crawler result as JSON.',
supported_formats=[json_format],
as_reference=True)]
super(PavicsCrawler, self).__init__(
self._handler,
identifier='pavicrawler',
title='PAVICS Crawler',
            abstract=('Crawl thredds servers and write metadata to the '
                      'Solr database.'),
version='0.1',
inputs=inputs,
outputs=outputs,
store_supported=True,
status_supported=True)
def _handler(self, request, response):
        if 'target_files' in request.inputs:
            target_files = [inp.data
                            for inp in request.inputs['target_files']]
        else:
            target_files = None
# If a target thredds server is specified, it must be in the list
# of thredds servers from the config, otherwise we fall back to
# scanning all thredds servers.
# Suggestion: decompose the target_thredds and compare individual
# sections of the url/port to allow more flexibility in the
# comparison.
if ('target_thredds' in request.inputs) and \
(request.inputs['target_thredds'][0].data in self.thredds_servers):
target_thredds_servers = [request.inputs['target_thredds'][0].data]
else:
target_thredds_servers = self.thredds_servers
try:
for thredds_server in target_thredds_servers:
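                # When the WMS alternate server template contains '<HOST>',
                # substitute the bare hostname of the current thredds
                # server, e.g. 'http://myhost:8083/thredds' -> 'myhost'
                # (illustrative URL).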
if (self.wms_alternate_server is not None) and \
('<HOST>' in self.wms_alternate_server):
wms_with_host = self.wms_alternate_server.replace(
'<HOST>', thredds_server.split('/')[2].split(':')[0])
else:
wms_with_host = self.wms_alternate_server
update_result = catalog.pavicrawler(
thredds_server, self.solr_server, my_facets,
set_dataset_id=True, wms_alternate_server=wms_with_host,
target_files=target_files)
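                # Note: update_result keeps only the result of the last
                # crawled server; results from earlier iterations are
                # overwritten.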
        except Exception:
            raise Exception(traceback.format_exc())
        # Construct a timestamped output file name (unique only down to
        # one-second resolution).
time_str = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
output_file_name = "solr_result_{0}_.json".format(time_str)
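        # e.g. 'solr_result_2017-01-01T12:00:00Z_.json' (illustrative
        # timestamp).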
output_file = os.path.join(output_path, output_file_name)
        with open(output_file, 'w') as f:
            f.write(json.dumps(update_result))
response.outputs['crawler_result'].file = output_file
response.outputs['crawler_result'].output_format = json_format
return response