# Source code for jsonschema_pyref

import json
import os.path as pth
import pkgutil
from functools import lru_cache
from importlib import import_module
from typing import Any, Callable, List, Optional, Type, Union
from urllib.parse import (ParseResult, urljoin as _urllib_urljoin,
                          urlparse as _urllib_urlparse)

import jsonschema  # type: ignore
from jsonschema import RefResolver as _RefResolver  # type: ignore
from jsonschema import ValidationError  # type: ignore

try:
    import yaml  # type: ignore
except ImportError:
    yaml = None


# Version string of this package.
__version__ = '0.1.1.dev0'


# Public names exported by ``from jsonschema_pyref import *``.
__all__ = ['URL_SCHEME_RESOLVERS', 'RefResolver', 'ValidationError', 'urljoin',
           'urlparse', 'validate']


# Annotation used for the ``document`` argument of `validate` and the
# ``referrer`` argument of `RefResolver`.
_JSONTypes = Union[dict, list, str, int, float, bool, None]
# Annotation used for the ``schema`` argument of `validate`.
_SchemaTypes = Union[dict, bool]


def validate(document: _JSONTypes, schema: _SchemaTypes,
             cls: Optional[Type] = None, *args: Any, **kwargs: Any):
    """
    Drop-in replacement for `jsonschema.validate` which uses the customized
    `jsonschema_pyref.RefResolver` by default.

    If the caller does not pass a ``resolver`` keyword argument, one is
    created from ``schema`` with `RefResolver.from_schema`; an explicitly
    supplied ``resolver`` is left untouched.  ``cls`` and any additional
    positional and keyword arguments are forwarded to `jsonschema.validate`
    unchanged, and its return value is returned.

    >>> from jsonschema_pyref import validate
    >>> schema = {
    ...     'properties': {
    ...         'a': {'$ref': 'py-obj:jsonschema_pyref.examples.schema1'}
    ...     }
    ... }
    ...
    >>> validate({'a': 'hello'}, schema) is None
    True
    """

    # Only build a resolver when the caller did not provide one.
    if 'resolver' not in kwargs:
        kwargs['resolver'] = RefResolver.from_schema(schema)

    # ``cls`` is forwarded positionally.  Passing it as ``cls=cls`` ahead of
    # ``*args`` would put the first extra positional argument into the ``cls``
    # slot and fail with "got multiple values for argument 'cls'".
    return jsonschema.validate(document, schema, cls, *args, **kwargs)


class RefResolver(_RefResolver):
    """
    Drop-in replacement for `jsonschema.RefResolver` that can resolve
    ``py-obj`` and ``py-pkgdata`` URLs.

    The easiest way to use it is to initialize it using the
    `~jsonschema.RefResolver.from_schema` method and pass it as a keyword
    argument to `jsonschema.validate`:

    >>> import jsonschema
    >>> from jsonschema_pyref import RefResolver
    >>> schema = {
    ...     'properties': {
    ...         'a': {'$ref': 'py-obj:jsonschema_pyref.examples.schema1'}
    ...     }
    ... }
    ...
    >>> resolver = RefResolver.from_schema(schema)
    >>> jsonschema.validate({'a': 'hello'}, schema, resolver=resolver) is None
    True
    """

    def __init__(self, base_uri: str, referrer: _JSONTypes,
                 store: Union[dict, tuple] = (),
                 cache_remote: bool = True,
                 handlers: Union[dict, tuple] = (),
                 urljoin_cache: Optional[Callable[[str, str], str]] = None,
                 remote_cache: Optional[Callable[[str], dict]] = None) -> None:
        """
        Initialize the resolver with this package's defaults filled in.

        All arguments are passed through to `jsonschema.RefResolver`, with
        two adjustments:

        * ``handlers`` is merged on top of `URL_SCHEME_RESOLVERS`, so the
          ``py-obj`` and ``py-pkgdata`` handlers are always present unless the
          caller overrides them.
        * ``urljoin_cache`` defaults to an LRU-cached version of this
          package's `urljoin` when it is not given.
        """
        # Start from a copy so the module-level registry is never mutated.
        merged_handlers = dict(URL_SCHEME_RESOLVERS)
        # Users may still override the builtin handlers for py-obj and
        # py-pkgdata as well as any others
        merged_handlers.update(handlers)

        # Same for urljoin_cache
        if urljoin_cache is None:
            urljoin_cache = lru_cache(1024)(urljoin)

        super().__init__(base_uri, referrer, store=store,
                         cache_remote=cache_remote,
                         handlers=merged_handlers,
                         urljoin_cache=urljoin_cache,
                         remote_cache=remote_cache)


def urljoin(base: str, url: str, allow_fragments: bool = True) -> str:
    """
    Extends the stdlib's `urllib.parse.urljoin` to support the custom URL
    schemes implemented by this package.

    The join semantics for ``py-obj`` and ``py-pkgdata`` URLs differ from
    those of typical http-like URLs, as well as from each other.

    ``allow_fragments`` is only passed on to `urllib.parse.urljoin`, which is
    used when the scheme of ``base`` is not one of the custom schemes; the
    joiners for the custom schemes do not receive it.

    The rules for ``py-obj`` are as follows:

    * If joined with a "relative" URL, if the URL starts with a Python
      identifier, it is simply appended to the current object path:

      >>> from jsonschema_pyref import urljoin
      >>> urljoin('py-obj:a.b.c', 'd.e')
      'py-obj:a.b.c.d.e'

    * However, if the "relative" URL starts with one or more ``.``, it
      is processed relative to the base path using similar semantics to
      Python relative imports.  That is, one dot means a different attribute
      of the same object, and so on:

      >>> urljoin('py-obj:a.b.c', '.d')
      'py-obj:a.b.d'
      >>> urljoin('py-obj:a.b.c', '..d')
      'py-obj:a.d'
      >>> urljoin('py-obj:a.b.c', '..d.e')
      'py-obj:a.d.e'

    * When joining an absolute URL with the ``py-obj`` scheme, the new URL
      replaces the base:

      >>> urljoin('py-obj:a.b.c', 'py-obj:d.e.f')
      'py-obj:d.e.f'

    * And likewise when joining an absolute URL with an entirely different
      scheme:

      >>> urljoin('py-obj:a.b.c', 'https://example.com')
      'https://example.com'

    * Fragments are dropped from the base URL, and retained from the joined
      URL:

      >>> urljoin('py-obj:a.b.c#replaced', '.d#fragment')
      'py-obj:a.b.d#fragment'
      >>> urljoin('py-obj:a.b.c#replaced', '#fragment')
      'py-obj:a.b.c#fragment'

    The rules for ``py-pkgdata`` are as follows:

    * If joined to a relative URL, it is treated as a different file relative
      to the same package/directory as the base URL, very similarly to how
      `urllib.parse.urljoin` works for relative paths joined to http(s) URLs:

      >>> urljoin('py-pkgdata:a.b.c/schema1.json', 'schema2.json')
      'py-pkgdata:a.b.c/schema2.json'
      >>> urljoin('py-pkgdata:a.b.c/schemas/schema1.json', 'schema2.json')
      'py-pkgdata:a.b.c/schemas/schema2.json'
      >>> urljoin('py-pkgdata:a.b.c/schemas/sub/schema1.json', 'schema2.json')
      'py-pkgdata:a.b.c/schemas/sub/schema2.json'
      >>> urljoin('py-pkgdata:a.b.c/schemas/schema1.json', '../more_schemas/schema2.json')
      'py-pkgdata:a.b.c/more_schemas/schema2.json'

      .. note::

          This does not support relative URLs with a package-relative
          component, only different paths within the same package.  Maybe
          support for this could be added in the future, but under the current
          format it's too ambiguous.

    * When joining an absolute URL, if the joined URL is in the same package
      it works the same as the relative case (effectively the joined URL
      replaces the base):

      >>> urljoin('py-pkgdata:a.b.c/schema1.json', 'py-pkgdata:a.b.c/schema2.json')
      'py-pkgdata:a.b.c/schema2.json'

    * Likewise when they are different packages:

      >>> urljoin('py-pkgdata:a.b.c/schema1.json', 'py-pkgdata:d.e.f/schema2.json')
      'py-pkgdata:d.e.f/schema2.json'

    * Or if they are the exact same URL, that URL is returned:

      >>> urljoin('py-pkgdata:a.b.c/sub/schema1.json', 'py-pkgdata:a.b.c/sub/schema1.json')
      'py-pkgdata:a.b.c/sub/schema1.json'

    * And likewise if they are not the same scheme at all:

      >>> urljoin('py-pkgdata:a.b.c/schema1.json', 'http://example.com/schema2.json')
      'http://example.com/schema2.json'

    * The same rules apply for fragments as with ``py-obj``:

      >>> urljoin('py-pkgdata:a.b.c/schema1.json#replaced', 'schema2.json#fragment')
      'py-pkgdata:a.b.c/schema2.json#fragment'
      >>> urljoin('py-pkgdata:a.b.c/schema1.json#replaced', '#fragment')
      'py-pkgdata:a.b.c/schema1.json#fragment'

    """
    # The scheme of the *base* URL decides which join rules apply.
    joiner = _URL_SCHEME_JOINERS.get(urlparse(base).scheme)
    if joiner is not None:
        return joiner(base, url)

    return _urllib_urljoin(base, url, allow_fragments=allow_fragments)


def urlparse(url: str) -> ParseResult:
    """
    Extends the stdlib's `urllib.parse.urlparse` to support the custom URL
    schemes implemented by this package.

    If the text before the first ``:`` in ``url`` is a key of
    ``_URL_SCHEME_PARSERS``, the matching parser is used; otherwise (including
    when ``url`` contains no ``:`` at all) parsing is delegated to
    `urllib.parse.urlparse`.
    """

    # ``colon`` is empty when there is no ':' in the URL, in which case the
    # custom parser registry is not consulted at all.
    scheme, colon, _ = url.partition(':')
    if colon and scheme in _URL_SCHEME_PARSERS:
        return _URL_SCHEME_PARSERS[scheme](url)

    return _urllib_urlparse(url)


def _is_dotted_identifier(s: str) -> bool:
    """
    Return True if string is one or more valid Python identifiers separated
    by ``.``.
    """

    s = s.strip()
    if not s:
        return False

    for ident in s.split('.'):
        if not ident.isidentifier():
            return False

    return True


def _resolve_url_py_obj(url: str) -> dict:
    """
    Load the object referenced by a ``py-obj:`` URL.

    The dotted path of the URL is split into an importable module name and a
    trailing chain of attribute names: the full path is tried as a module
    first, and on each `ImportError` the last component is moved from the
    module name to the attribute chain.  The attributes are then looked up in
    order starting from the imported module.

    :raises ImportError: if not even the first component of the path can be
        imported.
    :raises AttributeError: if an attribute in the chain cannot be read.
    :raises ValueError: if the URL is not a valid ``py-obj`` URL, or the
        resolved object is not a `dict`.
    """
    path = _urlparse_py_obj(url).path
    modname = path
    objpath: List[str] = []
    module = None

    while module is None:
        try:
            module = import_module(modname)
        except ImportError:
            # Check the *remaining* module name, not the full path: once
            # ``modname`` has been reduced to a single component there is
            # nothing left to split off, so the ImportError must propagate
            # (checking ``path`` here used to end in an unpacking ValueError).
            if '.' not in modname:
                raise

            # At least part of the dotted name is the name of the object
            # in the module, and not the module itself.
            modname, objname = modname.rsplit('.', 1)
            objpath.insert(0, objname)

    obj = module
    for idx, attr in enumerate(objpath):
        try:
            obj = getattr(obj, attr)
        except Exception as exc:
            # Report the dotted name of the object the lookup failed on.
            raise AttributeError(
                f"no attribute '{attr}' on "
                f"{'.'.join([modname] + objpath[:idx])}") from exc

    if not isinstance(obj, dict):
        raise ValueError(
            f'path {path} does not resolve to a dict, which it must be in '
            f'order to be a JSON Schema')

    return obj


def _resolve_url_py_pkgdata(url: str) -> dict:
    """
    Load the schema file referenced by a ``py-pkgdata:`` URL.

    The URL path is split at the first ``/`` into a package name and a
    package-relative filename.  The file contents are read with
    `pkgutil.get_data` and parsed according to the filename extension:
    ``.yaml``/``.yml`` with ``yaml.safe_load`` and ``.json`` with
    `json.loads`.

    :raises RuntimeError: if `pkgutil.get_data` returns ``None``, or if a YAML
        file is requested but the ``yaml`` module could not be imported.
    :raises ValueError: if the URL is not a valid ``py-pkgdata`` URL, or the
        filename extension is not one of the supported ones.
    """
    parsed = _urlparse_py_pkgdata(url)
    pkgname, filename = parsed.path.split('/', 1)
    _, ext = pth.splitext(filename)

    # TODO: Support more file types and a pluggable registry of loaders for
    # different types--as long as it can be loaded into a dict it can be
    # supported
    yaml_exts = {'.yaml', '.yml'}
    json_exts = {'.json'}

    data = pkgutil.get_data(pkgname, filename)

    if data is None:
        raise RuntimeError(
            f'no resource named {filename} could be loaded from package '
            f'{pkgname}; see https://docs.python.org/3/library/pkgutil.html#pkgutil.get_data')

    if ext in yaml_exts:
        # ``yaml`` is None when the optional PyYAML import failed.
        if yaml is None:
            raise RuntimeError(
                'reading schemas from YAML files requires PyYAML to be '
                'installed: https://pypi.org/project/PyYAML/')

        return yaml.safe_load(data)

    if ext in json_exts:
        return json.loads(data)

    # Only needed for the error message, so it is built here rather than
    # up front.
    supported_exts = yaml_exts | json_exts
    raise ValueError(
        f'schemas are currently only loadable from files with the '
        f'following filename extensions: {sorted(supported_exts)}')


def _urljoin_py_obj(base: str, url: str) -> str:
    """
    Join ``url`` onto the ``py-obj:`` URL ``base``.

    * If ``url`` has a scheme it is returned unchanged.
    * Otherwise each leading ``.`` of ``url`` removes one trailing component
      from the object path of ``base``, and the remainder of ``url``'s path is
      appended to what is left.
    * The fragment of ``base`` is dropped; the fragment of ``url``, if any, is
      kept.

    :raises ValueError: if ``base`` is not a valid ``py-obj`` URL, if ``url``
        has more leading dots than ``base`` has path components, or if the
        join would leave an empty object path.
    """
    parsedb = _urlparse_py_obj(base)
    parsedu = urlparse(url)

    if parsedu.scheme:
        return url

    # The number of leading dots is the relative depth.
    pathu = parsedu.path.lstrip('.')
    rel_depth = len(parsedu.path) - len(pathu)

    path_parts = parsedb.path.split('.')

    if rel_depth > len(path_parts):
        raise ValueError(
            f'relative object path {url} outside the original path {base}')

    if rel_depth > 0:
        path_parts = path_parts[:-rel_depth]

    if pathu:
        path_parts.append(pathu)

    # Joining the surviving parts (rather than concatenating with a literal
    # '.') avoids a leading dot, e.g. 'py-obj:.d', when the relative depth
    # consumed the whole base path.
    new_path = '.'.join(path_parts)

    if not new_path:
        # Every component of the base was stripped and nothing was appended,
        # so there is no object left to refer to.
        raise ValueError(
            f'relative object path {url} outside the original path {base}')

    new_url = 'py-obj:' + new_path

    if parsedu.fragment:
        new_url += '#' + parsedu.fragment

    return new_url


def _urljoin_py_pkgdata(base: str, url: str) -> str:
    """
    Join ``url`` onto the ``py-pkgdata:`` URL ``base``.

    * A ``url`` with a different scheme replaces the base and is returned
      unchanged.
    * A ``url`` that is itself a ``py-pkgdata`` URL supplies the whole new
      path, whether or not its package or filename match those of ``base``.
    * A ``url`` without a scheme is resolved against the path of ``base`` by
      `urllib.parse.urljoin`, which is given the paths only since it does not
      recognize the scheme.

    The fragment of ``base`` is dropped; the fragment of ``url``, if any, is
    kept.
    """
    base_parts = _urlparse_py_pkgdata(base)
    target = urlparse(url)

    # Absolute URL in some other scheme: nothing to join.
    if target.scheme and target.scheme != base_parts.scheme:
        return url

    if target.scheme:
        # Same scheme: the joined URL's path wins in every case.
        joined = target.path
    else:
        # Relative to the file named by the base URL.
        joined = _urllib_urljoin(base_parts.path, target.path)

    suffix = '#' + target.fragment if target.fragment else ''
    return 'py-pkgdata:' + joined + suffix


def _urlparse_py_obj(url: str) -> ParseResult:
    """
    Parse URLs with the ``py-obj:`` scheme.

    Everything between the scheme and the first ``#`` is the path, which must
    be a dotted chain of Python identifiers; everything after the ``#`` is the
    fragment.  Whitespace around the path is removed.  The ``netloc``,
    ``params`` and ``query`` fields of the result are always empty.

    :raises ValueError: if ``url`` does not start with ``py-obj:`` or its path
        is not a dotted identifier.
    """

    if not url.startswith('py-obj:'):
        raise ValueError(f'not a py-obj URL: {url}')

    rest = url[len('py-obj:'):]
    path, _, fragment = rest.partition('#')

    # The validity check below ignores surrounding whitespace, so strip it
    # from the returned path too (as the py-pkgdata parser does); otherwise a
    # path that passed validation would still be unusable as a module name.
    path = path.strip()

    if not _is_dotted_identifier(path):
        fmt = 'py-obj:<identifier>[.<identifier>...][#<fragment>]'

        raise ValueError(
            f'py-obj URL path component {path} contains an invalid Python '
            f'identifier; the correct format for a py-obj URL is: {fmt}')

    return ParseResult(scheme='py-obj', netloc='', path=path, params='',
                       query='', fragment=fragment)


def _urlparse_py_pkgdata(url: str) -> ParseResult:
    """
    Parse URLs with the ``py-pkgdata:`` scheme.

    Everything between the scheme and the first ``#`` is the path, and
    everything after the ``#`` is the fragment.  The path is split at its
    first ``/`` into a module name, which must be a dotted chain of Python
    identifiers, and a non-empty filename.  Whitespace around either part is
    removed and the result's path is ``<module>/<filename>``.  The ``netloc``,
    ``params`` and ``query`` fields of the result are always empty.

    :raises ValueError: if ``url`` does not start with ``py-pkgdata:``, has no
        filename, or has a module component that is not a dotted identifier.
    """

    if not url.startswith('py-pkgdata:'):
        raise ValueError(f'not a py-pkgdata URL: {url}')

    rest = url[len('py-pkgdata:'):]
    path, _, fragment = rest.partition('#')

    fmt = 'py-pkgdata:<module>[.<module>...]/<path>[#<fragment>]'

    # With no '/' in the path the filename comes out empty and is rejected
    # below.
    module, _, filename = path.strip().partition('/')

    # The validity check on the module ignores surrounding whitespace, so
    # strip it from the value placed in the result as well.
    module = module.strip()
    filename = filename.strip()

    if not filename:
        raise ValueError(
            f'py-pkgdata URL must contain a package-relative path to a file '
            f'contained in the package, starting with a "/"; the correct '
            f'format for a py-pkgdata URL is: {fmt}')

    if not _is_dotted_identifier(module):
        raise ValueError(
            f'py-pkgdata URL module component {module} is not a valid Python '
            f'identifier; the correct format for a py-pkgdata URL is: {fmt}')

    return ParseResult(scheme='py-pkgdata', netloc='',
                       path=f'{module}/{filename}', params='', query='',
                       fragment=fragment)


# Maps each custom URL scheme to the function that loads the object it
# refers to; `RefResolver.__init__` copies this dict as its default handlers.
URL_SCHEME_RESOLVERS = {
    'py-obj': _resolve_url_py_obj,
    'py-pkgdata': _resolve_url_py_pkgdata
}
"""
Additional URL handlers for the custom URL schemes supported by this package.

This can be passed to the ``handlers`` argument of `jsonschema.RefResolver`,
but take note: It is also necessary to override the ``urljoin_cache`` argument
to use `jsonschema_pyref.urljoin` as follows::

    >>> import jsonschema
    >>> from jsonschema_pyref import URL_SCHEME_RESOLVERS, urljoin
    >>> from functools import lru_cache
    >>> resolver = jsonschema.RefResolver(
    ...     '', {}, handlers=URL_SCHEME_RESOLVERS,
    ...     urljoin_cache=lru_cache(1024)(urljoin))

This boilerplate can be avoided entirely by using the supplied
`jsonschema_pyref.RefResolver` which has these enhancements by default (and
is otherwise identical to `jsonschema.RefResolver`).
"""


# Join functions for the custom schemes; `urljoin` picks one by the scheme of
# the base URL and calls it with ``(base, url)``.
_URL_SCHEME_JOINERS = {
    'py-obj': _urljoin_py_obj,
    'py-pkgdata': _urljoin_py_pkgdata
}


# Parse functions for the custom schemes; `urlparse` picks one by the text
# before the first ':' of the URL and calls it with the full URL.
_URL_SCHEME_PARSERS = {
    'py-obj': _urlparse_py_obj,
    'py-pkgdata': _urlparse_py_pkgdata
}