|
""" |
|
Functionality to convert tabular data in Frictionless Data Packages to CSVW. |
|
|
|
We translate [table schemas](https://specs.frictionlessdata.io/table-schema/) defined |
|
for [data resources](https://specs.frictionlessdata.io/data-resource/) in a |
|
[data package](https://specs.frictionlessdata.io/data-package/) to a CSVW TableGroup. |
|
|
|
This functionality can be used together with the `frictionless describe` command to add |
|
CSVW metadata to "raw" CSV tables. |
|
""" |
|
import json |
|
import pathlib |
|
|
|
|
|
def convert_column_spec(spec):
    """
    Convert a Frictionless field descriptor into a CSVW column description.

    https://specs.frictionlessdata.io/table-schema/#field-descriptors

    :param spec: `dict` parsed from JSON representing a field descriptor; `name` is required.
    :return: `dict` with `name` and `datatype` keys and, depending on `spec`, `titles`, \
    `dc:description` and `propertyUrl`.
    :raises NotImplementedError: if a numeric field sets `bareNumber` to `false`.
    """
    # Frictionless types whose CSVW base datatype goes by a different name.
    typemap = {
        'year': 'gYear',
        'yearmonth': 'gYearMonth',
    }
    # Frictionless types which are passed through as CSVW base datatype unchanged.
    same_name_types = (
        'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration',
    )
    number_format_props = ('decimalChar', 'groupChar')

    # Fields without a (recognised) type are described as plain strings.
    datatype = {'base': 'string'}
    res = {'name': spec['name'], 'datatype': datatype}

    ftype = spec.get('type')
    fformat = spec.get('format')
    if ftype == 'string' and fformat == 'binary':
        datatype['base'] = 'binary'
    elif ftype == 'string' and fformat == 'uri':
        datatype['base'] = 'anyURI'
    elif ftype in typemap:
        datatype['base'] = typemap[ftype]
    elif ftype in same_name_types:
        datatype['base'] = ftype
        if ftype == 'string' and fformat:
            # Any other string format is only recorded, not interpreted.
            datatype['dc:format'] = fformat
        if ftype == 'boolean' and spec.get('trueValues') and spec.get('falseValues'):
            # The format holds a single "true|false" pair, so only the first value of each
            # list is used.
            datatype['format'] = '{}|{}'.format(
                spec['trueValues'][0], spec['falseValues'][0])
        if ftype in ('number', 'integer'):
            # `bareNumber` defaults to true (the cell holds nothing but the number), which
            # needs no translation. Only an explicit `false` - numbers wrapped in
            # non-numeric text - cannot be expressed.
            if spec.get('bareNumber') is False:
                raise NotImplementedError(
                    'bareNumber is not supported in CSVW. It may be possible to translate to '
                    'a number pattern, though. See '
                    'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/'
                    '#formats-for-numeric-types')
            if any(prop in spec for prop in number_format_props):
                datatype['format'] = {
                    prop: spec[prop] for prop in number_format_props if spec.get(prop)}
    elif ftype in ('object', 'array'):
        datatype['base'] = 'json'
        datatype['dc:format'] = 'application/json'
    elif ftype == 'geojson':
        datatype['base'] = 'json'
        datatype['dc:format'] = 'application/geo+json'

    if spec.get('title'):
        res['titles'] = [spec['title']]
    if 'description' in spec:
        res['dc:description'] = [spec['description']]
    if 'rdfType' in spec:
        res['propertyUrl'] = spec['rdfType']

    constraints = spec.get('constraints', {})
    for prop in ('required', 'minLength', 'maxLength', 'minimum', 'maximum'):
        if prop in constraints:
            datatype[prop] = constraints[prop]
    # A format derived from the field type takes precedence over a pattern constraint.
    if 'pattern' in constraints and 'format' not in datatype:
        datatype['format'] = constraints['pattern']

    return res
|
|
|
|
|
def convert_foreignKey(rsc_name, fk, resource_map):
    """
    Convert a Frictionless foreign key descriptor into a CSVW foreign key definition.

    https://specs.frictionlessdata.io/table-schema/#foreign-keys

    :param rsc_name: `name` property of the resource the foreign key is declared in.
    :param fk: `dict` parsed from JSON representing a single foreign key descriptor.
    :param resource_map: `dict` mapping resource names to resource paths.
    :return: `dict` with keys `columnReference` and `reference`.
    :raises KeyError: if the referenced resource name is not a key of `resource_map`.
    """
    reference = fk['reference']
    # A missing or empty `resource` is treated as a self-reference, i.e. the foreign key
    # points at the resource it is declared in.
    target_name = reference.get('resource') or rsc_name
    # "fields" is renamed to "columnReference" and the resource name replaced by its path.
    return dict(
        columnReference=fk['fields'],
        reference=dict(
            columnReference=reference['fields'],
            resource=resource_map[target_name],
        )
    )
|
|
|
|
|
def convert_table_schema(rsc_name, schema, resource_map):
    """
    Convert a Frictionless Table Schema into the data for a CSVW schema description.

    :param rsc_name: `name` property of the resource the schema belongs to. Needed to resolve \
    self-referential foreign keys.
    :param schema: `dict` parsed from JSON representing a frictionless Table Schema object.
    :param resource_map: `dict` mapping resource names to resource paths, needed to convert \
    foreign key constraints.
    :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object.
    """
    converted = {'columns': [convert_column_spec(field) for field in schema['fields']]}

    # The list of missing values is copied under the key "null".
    if 'missingValues' in schema:
        converted['null'] = schema['missingValues']

    # The primary key is copied over unchanged.
    if 'primaryKey' in schema:
        converted['primaryKey'] = schema['primaryKey']

    # Foreign keys have to be converted one by one.
    if 'foreignKeys' in schema:
        converted['foreignKeys'] = [
            convert_foreignKey(rsc_name, foreign_key, resource_map)
            for foreign_key in schema['foreignKeys']
        ]

    return converted
|
|
|
|
|
def convert_dialect(rsc):
    """
    Derive a CSVW dialect description from a Frictionless data resource.

    Limitations: lineTerminator is not supported.

    https://specs.frictionlessdata.io/csv-dialect/

    :param rsc: `dict` parsed from JSON representing a data resource.
    :return: `dict` of CSVW dialect properties; empty if `rsc` specifies none.
    """
    copied_props = ('delimiter', 'quoteChar', 'doubleQuote', 'skipInitialSpace', 'header')

    dialect = rsc.get('dialect', {})
    # The dialect properties may be nested under a "csv" key.
    if 'csv' in dialect:
        dialect = dialect['csv']

    converted = {}

    delimiter = dialect.get('delimiter')
    if delimiter:
        converted['delimiter'] = delimiter

    # The encoding is a property of the resource, not of its dialect.
    encoding = rsc.get('encoding')
    if encoding:
        converted['encoding'] = encoding

    converted.update({name: dialect[name] for name in copied_props if name in dialect})

    # The only copied property which goes by a different name in CSVW.
    if 'commentChar' in dialect:
        converted['commentPrefix'] = dialect['commentChar']

    return converted
|
|
|
|
|
class DataPackage:
    """
    A Frictionless Data Package descriptor which can be converted to a CSVW TableGroup.
    """

    def __init__(self, spec, directory=None):
        """
        :param spec: The data package descriptor; either another `DataPackage`, a `dict` \
        parsed from JSON, a `pathlib.Path` of a JSON file or a JSON formatted string.
        :param directory: Directory the CSVW metadata file name is derived from. Defaults to \
        the parent directory of `spec` if that is a `pathlib.Path`, else to the current \
        directory. Ignored if `spec` is a `DataPackage`.
        """
        if isinstance(spec, DataPackage):
            # Copy-construct: descriptor and directory are shared with the wrapped package.
            self.json = spec.json
            self.dir = spec.dir
            return

        if isinstance(spec, pathlib.Path):
            # `directory` may be passed as `str`; `self.dir` must be a `Path`, because
            # `to_tablegroup` joins a file name to it with the `/` operator.
            self.dir = pathlib.Path(directory) if directory else spec.parent
            spec = json.loads(spec.read_text(encoding='utf8'))
        else:
            if not isinstance(spec, dict):
                # Anything which is not an already parsed `dict` is assumed to be a JSON
                # formatted string.
                spec = json.loads(spec)
            self.dir = pathlib.Path(directory or '.')

        # The parsed data package descriptor.
        self.json = spec

    def to_tablegroup(self, cls=None):
        """
        Convert the data package to a CSVW TableGroup.

        Only resources with a `path`, a `schema`, `scheme` "file" and `format` "csv" are
        converted to tables.

        :param cls: Class whose `fromvalue` method is called with the CSVW metadata; \
        defaults to `csvw.TableGroup`.
        :return: The object returned by `cls.fromvalue`, with its `_fname` attribute set to \
        `csvw-metadata.json` in the package directory.
        """
        from csvw import TableGroup

        md = {'@context': "http://www.w3.org/ns/csvw"}

        # Package metadata. The complete original descriptor is kept as JSON string.
        md['dc:replaces'] = json.dumps(self.json)

        for flprop, csvwprop in [
            ('id', 'dc:identifier'),
            ('licenses', 'dc:license'),
            ('title', 'dc:title'),
            ('homepage', 'dcat:accessURL'),
            ('description', 'dc:description'),
            ('sources', 'dc:source'),
            ('contributors', 'dc:contributor'),
            ('profile', 'dc:conformsTo'),
            ('keywords', 'dc:subject'),
            ('created', 'dc:created'),
        ]:
            if flprop in self.json:
                md[csvwprop] = self.json[flprop]

        # The package name fills in for a missing id; if there is an id, it fills in for a
        # missing title.
        if 'name' in self.json:
            if 'id' not in self.json:
                md['dc:identifier'] = self.json['name']
            elif 'title' not in self.json:
                md['dc:title'] = self.json['name']

        # Data resource metadata. Resources without a path are ignored altogether.
        resources = [rsc for rsc in self.json.get('resources', []) if 'path' in rsc]
        # Foreign keys reference resources by name, CSVW references tables by url.
        resource_map = {rsc['name']: rsc['path'] for rsc in resources if 'name' in rsc}
        for rsc in resources:
            schema = rsc.get('schema')
            if schema and rsc.get('scheme') == 'file' and rsc.get('format') == 'csv':
                md.setdefault('tables', []).append(dict(
                    url=rsc['path'],
                    tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map),
                    dialect=convert_dialect(rsc),
                ))

        cls = cls or TableGroup
        res = cls.fromvalue(md)
        res._fname = self.dir / 'csvw-metadata.json'
        return res
|
|