maplib
# r'''
# # Overview
#
# '''

import logging

logger = logging.getLogger(__name__)

__all__ = [
    "Model",
    "a",
    "Triple",
    "SolutionMappings",
    "IndexingOptions",
    "ValidationReport",
    "Instance",
    "Template",
    "Argument",
    "Parameter",
    "Variable",
    "RDFType",
    "xsd",
    "rdf",
    "rdfs",
    "owl",
    "IRI",
    "Literal",
    "Prefix",
    "BlankNode",
    "explore",
    "add_triples",
    "generate_templates",
    "MaplibException",
]

import pathlib
from importlib.metadata import version

from .maplib import *
from .adding_triples import add_triples
from .template_generator import generate_templates

"""
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
"""
a = rdf.type

if (pathlib.Path(__file__).parent.resolve() / "graph_explorer").exists():
    from .graph_explorer import explore as _explore
else:

    def _explore(
        m: "Model",
        host: str = "localhost",
        port: int = 8000,
        bind: str = "localhost",
        popup=True,
        fts=True,
        fts_path: str = "fts",
    ):
        print("Contact Data Treehouse to try!")


def explore(*args, **kwargs):
    """Deprecated way to start an explore session.
    Use the explore method on a Model object instead.
    """
    logger.warning(
        "Calling `maplib.explore` is deprecated, use `m.explore()` on a `Model` object instead"
    )
    if kwargs.get("popup") is None or kwargs.get("popup") == True:
        logger.warning(
            """Calling explore without a popup argument defaults to it being on.
The popup argument is deprecated, so if you are relying on explore() opening a browser window
please change this to something like

```
import webbrowser
from maplib import Model

m = Model()
...
s = m.explore()
webbrowser.open(s.url, new=2)
```
"""
        )
        kwargs["popup"] = True
    elif kwargs.get("popup") == False:
        logger.warning(
            "The new explore function on a Model no longer defaults to popping up the browser"
        )

    return _explore(*args, **kwargs)


__version__ = version("maplib")
A model session allowing:
- Iterative model building using OTTR templates
- Interactive SPARQL querying and enrichment
- SHACL validation
Usage:
>>> from maplib import Model
... doc = '''
... @prefix ex:<http://example.net/ns#>.
... ex:ExampleTemplate [?MyValue] :: {
... ottr:Triple(ex:myObject, ex:hasValue, ?MyValue)
... } .'''
... m = Model()
... m.add_template(doc)
Parameters
- documents: a stOTTR document or a list of these
- indexing_options: options for indexing
Add a template to the model. Overwrites any existing template with the same IRI.
Parameters
- template: The template to add, as a stOTTR string or as a programmatically constructed Template.
Returns
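As noted above, the template argument can also be a programmatically constructed Template. A minimal sketch that mirrors ex:ExampleTemplate, assuming the keyword constructor used in the add_triples helper shown further down:
>>> from maplib import IRI, Model, Template, Triple, Variable
>>>
>>> my_value = Variable("MyValue")
>>> template = Template(
>>>     iri=IRI("http://example.net/ns#ExampleTemplate"),
>>>     parameters=[my_value],
>>>     instances=[
>>>         Triple(
>>>             IRI("http://example.net/ns#myObject"),
>>>             IRI("http://example.net/ns#hasValue"),
>>>             my_value,
>>>         )
>>>     ],
>>> )
>>> m = Model()
>>> m.add_template(template)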
Add prefixes that will be used in parsing of SPARQL, Datalog and OTTR.
Usage:
>>> m.add_prefixes({"ex" : "http://example.net/"})
Parameters
- prefixes: Known prefixes
Returns
Detaches and returns a named graph as its own Model object. The named graph is removed from the original Model.
Parameters
- graph: The name of the graph to detach. Defaults to the default graph.
- preserve_name: Preserve the name of the graph in the new Model, defaults to False.
Returns
The detached graph as its own Model object.
Map a template using a DataFrame. Usage:
>>> m.map("ex:ExampleTemplate", df)
If the template has no arguments, the df argument is not necessary.
Parameters
- template: Template, IRI, IRI string or prefixed template name.
- df: DataFrame where the columns have the same names as the template arguments
- graph: The IRI of the graph to add triples to.
- types: The types of the columns.
- validate_iris: Validate any IRI-columns.
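A minimal end-to-end sketch for ex:ExampleTemplate above. It assumes a Polars DataFrame (maplib works with Polars DataFrames) and passes the optional types argument using the string-literal form documented for RDFType below:
>>> import polars as pl
>>> from maplib import Model, RDFType
>>>
>>> m = Model()
>>> m.add_template('''
>>> @prefix ex:<http://example.net/ns#>.
>>> ex:ExampleTemplate [?MyValue] :: {
>>>     ottr:Triple(ex:myObject, ex:hasValue, ?MyValue)
>>> } .''')
>>> df = pl.DataFrame({"MyValue": ["a", "b", "c"]})
>>> m.map(
>>>     "ex:ExampleTemplate",
>>>     df,
>>>     types={"MyValue": RDFType.Literal("http://www.w3.org/2001/XMLSchema#string")},
>>> )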
Map a JSON file or string to triples. Usage:
>>> m.map_json("my_doc.json")
or:
>>> m.map_json('{"my_key":[true, "abc"]}')
Parameters
- path_or_string: Path to a JSON document or a JSON string.
- graph: The IRI of the graph to add triples to. None is the default graph.
- transient: If True, the triples are not included when serializing the graph.
Map triples using a DataFrame with columns subject, predicate and object. The predicate column can instead be supplied as a string (the verb argument) if it is the same for all rows. Usage:
>>> m.map_triples(df)
If the template has no arguments, the df argument is not necessary.
Parameters
- df: DataFrame where the columns are named subject and object. May also contain a verb-column.
- verb: The IRI of the verb (predicate), if it is the same for all rows.
- graph: The IRI of the graph to add triples to.
- types: The types of the columns.
- validate_iris: Validate any IRI-columns.
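A sketch continuing with the Model m from the examples above, passing a shared predicate as verb. The explicit IRI types are an assumption about how plain string columns are interpreted:
>>> import polars as pl
>>> from maplib import RDFType
>>>
>>> df = pl.DataFrame({
>>>     "subject": ["http://example.net/ns#obj1", "http://example.net/ns#obj2"],
>>>     "object": ["http://example.net/ns#obj3", "http://example.net/ns#obj4"],
>>> })
>>> m.map_triples(
>>>     df,
>>>     verb="http://example.net/ns#hasObj",
>>>     types={"subject": RDFType.IRI, "object": RDFType.IRI},
>>> )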
Create a default template and map it based on a DataFrame. Usage:
>>> template_string = m.map_default(df, "myKeyCol")
... print(template_string)
Parameters
- df: DataFrame where the columns have the same names as the template arguments
- primary_key_column: This column will be the subject of all triples in the generated template.
- dry_run: Do not map the template, only return the string.
- graph: The IRI of the graph to add triples to.
- types: The types of the columns.
- validate_iris: Validate any IRI-columns.
Returns
The generated template
Starts a graph explorer session. To run from Jupyter Notebook use:
>>> server = m.explore()
You can later stop the server with
>>> server.stop()
Parameters
- host: The hostname that we will point the browser to.
- port: The port where the graph explorer webserver listens on.
- bind: Bind to the following host / ip.
- fts: Enable full text search indexing
Query the contained knowledge graph using SPARQL. Currently, SELECT, CONSTRUCT and INSERT are supported. Usage:
>>> df = model.query('''
... PREFIX ex:<http://example.net/ns#>
... SELECT ?obj1 ?obj2 WHERE {
... ?obj1 ex:hasObj ?obj2
... }''')
... print(df)
Parameters
- query: The SPARQL query string
- parameters: PVALUES Parameters, a DataFrame containing the value bindings in the custom PVALUES construction.
- native_dataframe: Return columns with maplib-native formatting. Useful for round-trips.
- include_datatypes: Datatypes are not returned by default; set to True to return a SolutionMappings object with the solution mappings and their datatypes.
- graph: The IRI of the graph to query.
- streaming: Use Polars streaming
- return_json: Return JSON string.
- include_transient: Include transient triples when querying.
- max_rows: Maximum estimated rows in result, helps avoid out-of-memory errors.
- debug: Output debugging information, useful when the query unexpectedly has no results.
Returns
DataFrame (Select), list of DataFrames (Construct) containing results, None for Insert-queries, or SolutionMappings when include_datatypes is set.
Update the contained knowledge graph using a SPARQL Update query. Usage:
>>> m = Model(doc)
... # Omitted
... update_pizzas = '''
... ...'''
... m.update(update_pizzas)
Parameters
- update: The SPARQL Update string
- parameters: PVALUES Parameters, a DataFrame containing the value bindings in the custom PVALUES construction.
- streaming: Use Polars streaming
- include_transient: Include transient triples when querying.
- max_rows: Maximum estimated rows in result, helps avoid out-of-memory errors.
- debug: Output debugging information, useful when the update unexpectedly has no results.
Returns
None
Parameters
- options: Indexing options
- all: Apply to all existing and new graphs
- graph: The graph where indexes should be added
Returns
Validate the contained knowledge graph using SHACL. Assumes that the contained knowledge graph also contains SHACL Shapes.
Parameters
- shape_graph: The IRI of the Shape Graph.
- data_graph: The IRI of the Data Graph (defaults to the default graph).
- include_details: Include details of SHACL evaluation alongside the report. Currently uses a lot of memory.
- include_conforms: Include those results that conformed. Also applies to details.
- include_shape_graph: Include the shape graph in the report, useful when creating the graph from the report.
- include_datatypes: Return the datatypes of the validation report (and details).
- streaming: Use Polars streaming
- max_shape_constraint_results: Maximum number of results per shape and constraint. Reduces the size of the result set.
- only_shapes: Validate only these shapes, None means all shapes are validated (must be IRI, cannot be used with deactivate_shapes).
- deactivate_shapes: Disable validation of these shapes (must be IRI, cannot be used with only_shapes).
- dry_run: Only find targets of shapes, but do not validate them.
- max_rows: Maximum estimated rows in underlying SPARQL results, helps avoid out-of-memory errors.
- serial: Turns off most parallel validation of shapes.
Returns
Validation report containing a report (report.df) and whether the graph conforms (report.conforms)
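A short round-trip, assuming both the data and the SHACL shapes have already been loaded into m and that the validation method is named validate:
>>> report = m.validate()
>>> print(report.conforms)  # True if the data graph conforms to the shapes
>>> print(report.df)        # the validation report as a DataFrame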
Insert the results of a Construct query in the graph. Useful for being able to use the same query for inspecting what will be inserted and actually inserting. Usage:
>>> m = Model(doc)
... # Omitted
... hpizzas = '''
... PREFIX pizza:<https://github.com/magbak/maplib/pizza#>
... PREFIX ing:<https://github.com/magbak/maplib/pizza/ingredients#>
... CONSTRUCT { ?p a pizza:HeterodoxPizza }
... WHERE {
... ?p a pizza:Pizza .
... ?p pizza:hasIngredient ing:Pineapple .
... }'''
... m.insert(hpizzas)
Parameters
- query: The SPARQL Insert query string
- parameters: PVALUES Parameters, a DataFrame containing the value bindings in the custom PVALUES construction.
- native_dataframe: Return columns with maplib-native formatting. Useful for round-trips.
- include_datatypes: Datatypes are not returned by default, set to true to return a dict with the solution mappings and the datatypes.
- transient: Should the inserted triples be transient?
- source_graph: The IRI of the source graph to execute the construct query.
- target_graph: The IRI of the target graph to insert into.
- streaming: Use Polars streaming
- include_transient: Include transient triples when querying (but see "transient" above).
- max_rows: Maximum estimated rows in result, helps avoid out-of-memory errors.
- debug: Output debugging information, useful when the query unexpectedly has no results.
Returns
None
Reads triples from a file path. You can specify the format, or it will be derived from the file extension, e.g. filename.ttl or filename.nt. Specify transient if you only want the triples to be available for further querying and validation, but not persisted using write-methods.
Usage:
>>> m.read("my_triples.ttl")
Parameters
- file_path: The path of the file containing triples
- format: One of "ntriples", "turtle", "rdf/xml", "json-ld" or "cim/xml", otherwise it is inferred from the file extension.
- base_iri: Base iri
- transient: If True, these triples are not included when writing the graph to the file system.
- parallel: Parse triples in parallel, currently only for NTriples and Turtle. Assumes all prefixes are at the beginning of the document. Defaults to true only for NTriples.
- checked: Check IRIs etc.
- graph: The IRI of the graph to read the triples into, if None, it will be the default graph.
- replace_graph: Replace the graph with these triples? Will replace the default graph if no graph is specified.
- known_contexts: Contexts in JSON-LD documents are resolved towards this dict.
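A sketch combining a named graph with a transient read, continuing with the Model m from the examples above; the file names and graph IRI are illustrative:
>>> m.read("my_triples.ttl", graph="urn:graph:mydata")
>>> m.read("lookup_data.nt", transient=True)  # queryable and validatable, but not written back out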
Reads template(s) from a file path.
Usage:
>>> m.read("templates.ttl")
Parameters
- file_path: The path of the file containing templates in stOTTR format
Reads triples from a string. Specify transient if you only want the triples to be available for further querying and validation, but not persisted using write-methods.
Usage:
>>> m.reads(my_ntriples_string, format="ntriples")
Parameters
- s: String containing serialized triples.
- format: One of "ntriples", "turtle", "rdf/xml", "json-ld" or "cim/xml".
- base_iri: Base iri
- transient: If True, these triples are not included when writing the graph to the file system.
- parallel: Parse triples in parallel, currently only for NTriples and Turtle. Assumes all prefixes are at the beginning of the document. Defaults to true for NTriples.
- checked: Check IRIs etc.
- graph: The IRI of the graph to read the triples into.
- replace_graph: Replace the graph with these triples? Will replace the default graph if no graph is specified.
- known_contexts: Contexts in JSON-LD documents are resolved towards this dict.
Write the non-transient triples, in the NTriples format by default, to the specified file path.
Usage:
>>> m.write("my_triples.nt", format="ntriples")
Parameters
- file_path: The path of the file to write the triples to.
- format: One of "ntriples", "turtle", "rdf/xml".
- graph: The IRI of the graph to write.
- prefixes: The prefixes that will be used in turtle serialization.
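A sketch of a Turtle export, continuing with the Model m from the examples above; passing prefixes as a dict mirrors add_prefixes and is an assumption here:
>>> m.write(
>>>     "my_triples.ttl",
>>>     format="turtle",
>>>     prefixes={"ex": "http://example.net/ns#"},
>>> )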
Write the legacy CIM XML format.
>>> PROFILE_GRAPH = "urn:graph:profiles"
>>> m = Model()
>>> m.read(model_path, base_iri=publicID, format="rdf/xml")
>>> m.read("61970-600-2_Equipment-AP-Voc-RDFS2020_v3-0-0.rdf", graph=PROFILE_GRAPH, format="rdf/xml")
>>> m.read("61970-600-2_Operation-AP-Voc-RDFS2020_v3-0-0.rdf", graph=PROFILE_GRAPH, format="rdf/xml")
>>> m.write_cim_xml(
>>> "model.xml",
>>> profile_graph=PROFILE_GRAPH,
>>> description = "MyModel",
>>> created = "2023-09-14T20:27:41",
>>> scenario_time = "2023-09-14T02:44:43",
>>> modeling_authority_set="www.westernpower.co.uk",
>>> version="22",
>>> )
Parameters
- file_path: The path of the file to write the CIM XML to.
- profile_graph: The IRI of the graph containing the ontology of the CIM profile to write.
- model_iri: The IRI of the md:FullModel header resource (model_iri a md:FullModel). Generated if not provided.
- version: The object of the triple model_iri md:Model.version version.
- description: The object of the triple model_iri md:Model.description description.
- created: The object of the triple model_iri md:Model.created created.
- scenario_time: The object of the triple model_iri md:Model.scenarioTime scenario_time.
- modeling_authority_set: The object of the triple model_iri md:Model.modelingAuthoritySet modeling_authority_set.
- prefixes: Prefixes to be used in XML export.
- graph: The graph to write, defaults to the default graph.
Write the non-transient triples to a string in memory.
Usage:
>>> s = m.writes(format="turtle")
Parameters
- format: One of "ntriples", "turtle", "rdf/xml".
- graph: The IRI of the graph to write.
- prefixes: The prefixes used for turtle serialization.
Returns
The serialized triples (potentially a large string)
Write non-transient triples using the internal native Parquet format.
Usage:
>>> m.write_native_parquet("output_folder")
Parameters
- folder_path: The path of the folder to write triples in the native format.
- graph: The IRI of the graph to write.
Parameters
- graph: The graph to get the predicate iris from.
- include_transient: Include predicates that occur only in transient triples?
Returns
The IRIs of the predicates currently in the given graph.
Parameters
- iri: The predicate IRI
- graph: The graph to get the predicate from.
- include_transient: Should we include transient triples?
Returns
A list of the underlying tables that store a given predicate.
Run the inference rules that are provided
Parameters
- ruleset: The Datalog ruleset (a string).
- graph: Apply the ruleset to this graph, defaults to the default graph, or the graph specified in the rules.
- native_dataframe: Return columns with maplib-native formatting. Useful for round-trips.
- include_datatypes: Datatypes are not returned by default, set to true to return a dict with the solution mappings and the datatypes.
- max_iterations: Maximum number of iterations.
- max_results: Maximum number of results.
- include_transient: Include transient triples when reasoning.
- max_rows: Maximum estimated rows in result, helps avoid out-of-memory errors.
- debug: Debugs rule bodies for executions that give no triples.
Returns
The inferred N-Tuples.
An OTTR Triple Pattern used for creating templates. This is the basis pattern which all template instances are rewritten into. Equivalent to:
>>> ottr = Prefix("http://ns.ottr.xyz/0.4/")
... Instance(ottr.suf("Triple"), subject, predicate, object, list_expander)
Parameters
- subject: The subject of the triple.
- predicate: The predicate (verb) of the triple.
- object: The object of the triple.
- list_expander: (How) should we do list expansion?
Returns
Detailed information about the solution mappings, the types of the variables and debugging for queries.
Options for indexing
Defaults to indexing on subjects, and on objects for selected predicates (e.g. rdf:type and rdfs:label)
Parameters
- object_sort_all: Enable object-indexing for all suitable predicates (doubles memory requirement).
- object_sort_some: Enable object-indexing for a selected list of predicates.
- fts_path: Enable full text search, with the index stored at this path
- subject_object_index: An index used to deduplicate before insertion, speeds up mapping at a moderate memory cost
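A sketch of enabling object-indexing for all predicates at construction time, assuming IndexingOptions takes the parameters above as keyword arguments:
>>> from maplib import IndexingOptions, Model
>>>
>>> options = IndexingOptions(object_sort_all=True)
>>> m = Model(indexing_options=options)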
SHACL Validation report. Only constructed by maplib.
Return the results of the validation report, if they exist.
Parameters
- native_dataframe: Return columns with maplib-native formatting. Useful for round-trips.
- include_datatypes: Return datatypes of the results DataFrame (returns SolutionMappings instead of DataFrame).
- streaming: Use the Polars streaming functionality.
Returns
The SHACL validation report, as a DataFrame
Returns the details of the validation report. Only available if validation was called with include_details=True.
Parameters
- native_dataframe: Return columns with maplib-native formatting. Useful for round-trips.
- include_datatypes: Return datatypes of the results DataFrame (returns SolutionMappings instead of DataFrame).
- streaming: Use the Polars streaming functionality.
Returns
Details of the SHACL validation report, as a DataFrame
Creates a new model object where the base graph is the validation report with results. Includes the details of the validation report in the new graph if they exist.
Returns
A template instance.
Parameters
- iri: The IRI of the template to be instantiated.
- arguments: The arguments for template instantiation.
- list_expander: (How) should we do list expansion?
Parameters
- arguments: The arguments to the template.
- list_expander: (How) should we list-expand?
Returns
An OTTR Template. Note that accessing parameters- or instances-fields returns copies. To change these fields, you must assign new lists of parameters or instances.
Create a new parameter for a Template.
Parameters
- variable: The variable.
- optional: Can the variable be unbound?
- allow_blank: Can the variable be bound to a blank node?
- rdf_type: The type of the variable. Can be nested.
- default_value: Default value when no value provided.
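A sketch of a typed, optional parameter, assuming keyword arguments matching the list above and the string-literal RDFType form documented below:
>>> from maplib import Parameter, RDFType, Variable
>>>
>>> name = Variable("name")
>>> name_parameter = Parameter(
>>>     variable=name,
>>>     optional=True,
>>>     rdf_type=RDFType.Literal("http://www.w3.org/2001/XMLSchema#string"),
>>> )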
A variable in a template.
The type of a column containing an RDF variable. For instance, IRIs are RDFType.IRI and a string literal is RDFType.Literal("http://www.w3.org/2001/XMLSchema#string")
The xsd namespace: http://www.w3.org/2001/XMLSchema#
The rdf namespace: http://www.w3.org/1999/02/22-rdf-syntax-ns#
The rdfs namespace: http://www.w3.org/2000/01/rdf-schema#
The owl namespace: http://www.w3.org/2002/07/owl#
An RDF literal.
Create a new RDF Literal
Parameters
- value: The lexical representation of the value.
- data_type: The data type of the value (an IRI).
- language: The language tag of the value.
A prefix that can be used to ergonomically build IRIs.
Create a new prefix.
Parameters
- iri: The prefix IRI.
- prefix_name: The name of the prefix
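A sketch using the single-argument form shown in the Triple example above; suf builds an IRI by appending a suffix to the prefix IRI:
>>> from maplib import Prefix
>>>
>>> ex = Prefix("http://example.net/ns#")
>>> my_object = ex.suf("myObject")  # the IRI http://example.net/ns#myObject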
A Blank Node.
Starts a graph explorer session. To run from Jupyter Notebook use:
>>> from maplib import explore
>>>
>>> server = explore(m)
You can later stop the server with
>>> server.stop()
Parameters
- m: The Model to explore
- host: The hostname that we will point the browser to.
- port: The port where the graph explorer webserver listens on.
- bind: Bind to the following host / ip.
- popup: Pop up the browser window.
- fts: Enable full text search indexing
def add_triples(
    source: Model, target: Model, source_graph: str = None, target_graph: str = None
):
    """(Zero) copy the triples from one Model into another.

    :param source: The source model
    :param target: The target model
    :param source_graph: The named graph in the source model to copy from. None means default graph.
    :param target_graph: The named graph in the target model to copy into. None means default graph.
    """
    for p in source.get_predicate_iris(source_graph):
        subject = Variable("subject")
        object = Variable("object")
        template = Template(
            iri=IRI("urn:maplib:tmp"),
            parameters=[subject, object],
            instances=[Triple(subject, p, object)],
        )
        sms = source.get_predicate(p, source_graph)
        for sm in sms:
            target.map(
                template,
                sm.mappings,
                types=sm.rdf_types,
                graph=target_graph,
            )
(Zero) copy the triples from one Model into another.
Parameters
- source: The source model
- target: The target model
- source_graph: The named graph in the source model to copy from. None means default graph.
- target_graph: The named graph in the target model to copy into. None means default graph.
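A usage sketch; the file name and graph IRI are illustrative:
>>> from maplib import Model, add_triples
>>>
>>> source = Model()
>>> source.read("my_triples.ttl")
>>> target = Model()
>>> add_triples(source, target, target_graph="urn:graph:copied")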
def generate_templates(m: Model, graph: Optional[str]) -> Dict[str, Template]:
    """Generate templates for instantiating the classes in an ontology

    :param m: The model where the ontology is stored. We mainly rely on rdfs:subClassOf, rdfs:range and rdfs:domain.
    :param graph: The named graph where the ontology is stored.

    :return A dictionary of templates for instantiating the classes in the ontology, where the keys are the class URIs.

    Usage example - note that it is important to add the templates to the Model you want to populate.
    >>> from maplib import Model, generate_templates
    >>>
    >>> m_ont = Model()
    >>> m_ont.read("my_ontology.ttl")
    >>> templates = generate_templates(m_ont)
    >>> m = Model()
    >>> for t in templates.values():
    >>>     m.add_template(t)
    >>> m.map("https://example.net/MyClass", df)
    """

    properties = get_properties(m, graph=graph)
    properties_by_domain = {}
    properties_by_range = {}
    for r in properties.iter_rows(named=True):
        dom = r["domain"]
        if dom in properties_by_domain:
            properties_by_domain[dom].append(r)
        else:
            properties_by_domain[dom] = [r]

        ran = r["range"]
        if ran in properties_by_range:
            properties_by_range[ran].append(r)
        else:
            properties_by_range[ran] = [r]

    subclasses = get_subclasses(m, graph=graph)

    subclass_of = {}
    for r in (
        subclasses.group_by("child")
        .agg(pl.col("parent").alias("parents"))
        .iter_rows(named=True)
    ):
        subclass_of[r["child"]] = r["parents"]

    class_ordering = topological_sort(subclasses)

    templates_without_typing = generate_templates_without_typing(
        properties_by_domain, properties_by_range, class_ordering, subclass_of
    )
    templates_with_typing = generate_templates_with_typing(templates_without_typing)
    templates = {}
    for t, template in templates_without_typing.items():
        templates[t + "_notype"] = template
    for t, template in templates_with_typing.items():
        templates[t] = template

    return templates
Generate templates for instantiating the classes in an ontology
Parameters
- m: The model where the ontology is stored. We mainly rely on rdfs:subClassOf, rdfs:range and rdfs:domain.
- graph: The named graph where the ontology is stored.
Returns
A dictionary of templates for instantiating the classes in the ontology, where the keys are the class URIs.
Usage example - note that it is important to add the templates to the Model you want to populate.
The base exception raised by maplib operations.