# IMPORTS
# ===================================================

import coreferee
import spacy_transformers
import torch
import spacy
from spacy import displacy
import networkx as nx
import ipycytoscape
import ipywidgets as widgets
import json

# DESCRIPTION
# ===================================================
# A simple example of building a knowledge graph from 
# text. The example processes the input text with the 
# following processing:
#
# 1. Named Entity Extraction NER
# 2. Co-reference resolution 
# 3. Semantic type enhancement using wikidata
# 4. Relationship extraction
# 5. Create NetworkX graph
# Pipes
# ===================================================
#
# 1. NER using spaCy's small English web model
# 2. Co-reference resolution using coreferee
# 3. Semantic type lookup using entity linker to search wikidata

# Build the spaCy pipeline: small English model, then co-reference
# resolution, then wikidata entity linking appended at the end.
# https://spacy.io/models/en#en_core_web_sm
nlp = spacy.load('en_core_web_sm')
# https://spacy.io/universe/project/coreferee
nlp.add_pipe('coreferee')
# https://github.com/egerber/spaCy-entity-linker
nlp.add_pipe("entityLinker", last=True)
# NOTE(review): the notebook cell's repr output
# (`<spacy_entity_linker.EntityLinker.EntityLinker at 0x1b1a09f40>`) had been
# pasted into the source here; it is not valid Python and has been removed.
# Example text to process
# ===================================================

nlp_example_text = 'Robert Jackson was born in Ely. Robert Jackson is a painter and he now lives in Cromer'
# Process document
doc = nlp(nlp_example_text)
# Display output of NER (rendered inline when run in a notebook).
# Expected entities for the example text:
#   Robert Jackson PERSON was born in Ely GPE . Robert Jackson PERSON
#   is a painter and he now lives in Cromer GPE
# NOTE(review): the line above was previously pasted in as bare text
# (notebook cell output), which is not valid Python.
displacy.render(doc, style="ent")
# Build co-reference lookup
# ===================================================

# Parse the co-reference chains and build a lookup table keyed on the
# token index so that any co-reference points back to the original part
# of speech
# Map each co-referring mention's token index to the token index of the
# chain's first (canonical) mention, skipping the canonical mention itself.
coref_lookup = {
    mention[0]: chain[0][0]
    for chain in doc._.coref_chains
    for mention in chain
    if mention[0] != chain[0][0]
}
# Enrich spaCy entities using wikidata
# ===================================================

# wikidata lookups need complicated 
# sparql queries so I've moved them
# into a separate file for simplicity 
from wikilookup import lookupwiki
# Dict of semantic types keyed on the entity span's text.
semantic_types = {}
# Walk every entity in the document that produced a wikidata query hit.
for linked in doc._.linkedEntities:
    span_text = str(linked.span)
    ents = linked.span.ents
    if ents and ents[0].label_ != 'GPE':
        # NER matched a non-GPE entity: keep its NER label as the type.
        semantic_types[span_text] = ents[0].label_
    else:
        # GPEs — and spans NER did not match at all — get the first
        # wikidata instance_of property as their semantic type instead.
        semantic_types[span_text] = lookupwiki(str(linked.get_id()))
# BUILD KNOWLEDGE GRAPH and RELATIONSHIP EXTRACTION
# ===================================================

# A simple relationship extraction model which builds
# triples from Subject-Verb-Object in the document

# co-references are also resolved back to their original
# references

# Any compounds are resolved.

# NetworkX nodes are created from entities and the semantic
# type is assigned as a type property. NetworkX links
# are created from relationships
from knowledgegraph import add_triple_to_graph

knowledge_graph = nx.Graph()

# Scan the tokens accumulating a (subject, relation, object) triple; as soon
# as all three parts are filled, add the triple to the graph and reset.
# Fixes vs. the original: `object` no longer shadows the builtin (renamed to
# `obj`), a dead no-op statement (`token.children` on its own line) has been
# removed, and indentation is normalised to 4 spaces.
subject, relation, obj = '', '', ''
for token in doc:
    # Resolve a co-referring token back to its canonical mention.
    if token.i in coref_lookup:
        token = doc[coref_lookup[token.i]]
        if token.dep_ == "nsubjpass":
            # For passive subjects, descend to the (last) child token.
            for child in token.children:
                token = child
    if not subject and token.dep_ in ("compound", "conj", "mod", "pobj"):
        if token.dep_ == "compound":
            # Take the whole compound span (e.g. "Robert Jackson").
            subject = str(doc[token.i: token.head.i + 1])
        else:
            subject = token.text
    elif token.pos_ in ("VERB", "AUX"):
        relation = relation + token.text + " "
    elif subject and not obj and relation and token.dep_ in ("compound", "conj", "mod", "pobj", "attr"):
        obj = token.text
    if subject and relation and obj:
        add_triple_to_graph(knowledge_graph, subject, relation, obj, semantic_types)
        subject, relation, obj = '', '', ''
# Render the knowledge graph as an interactive cytoscape widget and apply
# the stylesheet loaded from disk.
ipycytoscape_obj = ipycytoscape.CytoscapeWidget()
ipycytoscape_obj.graph.add_graph_from_networkx(knowledge_graph, directed=True)
ipycytoscape_obj

with open("./cytoscape_styles.json") as style_file:
    stylesheet = json.load(style_file)

ipycytoscape_obj.set_style(stylesheet)
ipycytoscape_obj