# IMPORTS
# ===================================================
import coreferee
import spacy_transformers
import torch
import spacy
from spacy import displacy
import networkx as nx
import ipycytoscape
import ipywidgets as widgets
import json
# DESCRIPTION
# ===================================================
# A simple example of building a knowledge graph from
# text. The example processes the input text with the
# following processing:
#
# 1. Named Entity Extraction NER
# 2. Co-reference resolution
# 3. Semantic type enhancement using wikidata
# 4. Relationship extraction
# 5. Create NetworkX graph
# Pipes
# ===================================================
#
# 1. NER using Spacy's web small model
# 2. Co-reference resolution using coreferee
# 3. Semantic type lookup using entity linker to search wikidata
# Build the spaCy pipeline: small English web model, then coreferee
# for co-reference resolution, then the entity linker for Wikidata.
# https://spacy.io/models/en#en_core_web_sm
nlp = spacy.load('en_core_web_sm')
# https://spacy.io/universe/project/coreferee
nlp.add_pipe('coreferee')
# https://github.com/egerber/spaCy-entity-linker
nlp.add_pipe("entityLinker", last=True)
# NOTE(review): pasted REPL output ("<spacy_entity_linker.EntityLinker...>")
# followed here in the original; it is a syntax error in a plain Python
# script and has been removed.
# Example text to process
# ===================================================
nlp_example_text = (
    'Robert Jackson was born in Ely. '
    'Robert Jackson is a painter and he now lives in Cromer'
)
# Run the full pipeline (NER, coreferee, entity linker) over the text
doc = nlp(nlp_example_text)
# Visualise the named entities found by NER
displacy.render(doc, style="ent")
# Build co-reference lookup
# ===================================================
# Parse the co-reference chains and build a lookup table keyed on
# token index, so that any co-referring mention can be resolved back
# to the original part of speech (the first mention in the chain).
coref_lookup = {}
for chain in doc._.coref_chains:
    # first token index of the first mention = canonical reference
    head_index = chain[0][0]
    for mention in chain:
        # skip the canonical mention itself (self reference)
        if mention[0] != head_index:
            coref_lookup[mention[0]] = head_index
# Enrich spaCy entities using Wikidata
# ===================================================
# Wikidata lookups need complicated SPARQL queries, so they have
# been moved into a separate module for simplicity.
from wikilookup import lookupwiki

# Dict of semantic types indexed on the entity span text
semantic_types = {}

# Walk every entity in the document which has a Wikidata query hit
for linked in doc._.linkedEntities:
    span_text = str(linked.span)
    ents = linked.span.ents
    if ents and ents[0].label_ != 'GPE':
        # NER matched a non-GPE entity: keep the NER label as the type
        semantic_types[span_text] = ents[0].label_
    else:
        # GPE entities, and spans NER did not match, take their type
        # from the Wikidata instance_of property (first instance only)
        instance = lookupwiki(str(linked.get_id()))
        semantic_types[span_text] = instance
# BUILD KNOWLEDGE GRAPH and RELATIONSHIP EXTRACTION
# ===================================================
# A simple relationship-extraction model which builds
# Subject-Verb-Object triples from the document.
# Co-references are resolved back to their original
# references, and any compounds are resolved.
# NetworkX nodes are created from entities, with the semantic
# type assigned as a property; NetworkX links are created
# from the extracted relationships.
from knowledgegraph import add_triple_to_graph

knowledge_graph = nx.Graph()
# NOTE: renamed 'object' -> 'obj' so the builtin is not shadowed
subject, relation, obj = '', '', ''
for token in doc:
    # Resolve co-references back to their canonical mention
    if token.i in coref_lookup:
        token = doc[coref_lookup[token.i]]
    if token.dep_ == "nsubjpass":
        # For passive subjects, descend to the token's last child
        # (a dead bare 'token.children' expression was removed here)
        for child in token.children:
            token = child
    if not subject and token.dep_ in ["compound", "conj", "mod", "pobj"]:
        if token.dep_ == "compound":
            # Resolve compounds to the full span up to the head token
            subject = str(doc[token.i: token.head.i + 1])
        else:
            subject = token.text
    elif token.pos_ in ["VERB", "AUX"]:
        # Accumulate multi-token relations (e.g. "was born")
        relation = relation + token.text + " "
    elif subject and not obj and relation and token.dep_ in ["compound", "conj", "mod", "pobj", "attr"]:
        obj = token.text
    # Emit a triple as soon as all three parts are populated
    if subject and relation and obj:
        add_triple_to_graph(knowledge_graph, subject, relation, obj, semantic_types)
        subject, relation, obj = '', '', ''
# Render the knowledge graph with ipycytoscape
ipycytoscape_obj = ipycytoscape.CytoscapeWidget()
ipycytoscape_obj.graph.add_graph_from_networkx(knowledge_graph, directed=True)
# Notebook display residue: a bare expression is a no-op in a script
ipycytoscape_obj
# Load the Cytoscape visual styles and apply them to the widget
with open("./cytoscape_styles.json") as style_file:
    style_spec = json.load(style_file)
ipycytoscape_obj.set_style(style_spec)
ipycytoscape_obj