Merge pull request #208 from semantic-systems/develop
Going live with Publication and Researcher details pages
abdullah-rana authored Jun 28, 2024
2 parents d71b272 + 400f632 commit 8fa8a49
Showing 32 changed files with 1,758 additions and 779 deletions.
10 changes: 9 additions & 1 deletion config.yaml
@@ -15,6 +15,11 @@ search_url_ieee: http://ieeexploreapi.ieee.org/api/v1/search/articles?apikey={ap
search_url_eudat: https://b2share.eudat.eu/api/records/?page=1&size=25&sort=bestmatch&q=
search_url_openaire_products: https://api.openaire.eu/search/researchProducts?format=json&size=25&keywords=

publication_details_openalex_publications: https://api.openalex.org/works/
publication_details_crossref_publications: https://api.crossref.org/works/
publication_details_semanticscholar_publication: https://api.semanticscholar.org/graph/v1/paper/
publication_details_semanticscholar_recommendations: https://api.semanticscholar.org/recommendations/v1/papers/forpaper/

search_url_orcid: https://pub.orcid.org/v3.0/expanded-search/?start=0&rows=25&q=
search_url_dblp: https://dblp.org/search?q=
search_url_orkg: https://orkg.org/api/resources/?size=20&q=
@@ -26,8 +31,11 @@ settings_file_publications: static/weights/publications-settings.json
number_of_records_to_show_on_page_load: 20
number_of_records_to_append_on_lazy_load: 10

chatbot_feature_enable: True
chatbot_server: https://nfdi-chatbot.nliwod.org
# chatbot_server: http://127.0.0.1:5005
endpoint_chat: /chat
endpoint_save_docs_with_embeddings: /save-docs-with-embeddings
endpoint_are_embeddings_generated: /are-embeddings-generated

openai_api_key: <API_KEY>
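
A rough illustration of how the new publication-details endpoints are consumed (not part of the commit; the example DOI and the YAML loading are assumptions):

# minimal sketch, assuming config.yaml sits in the working directory
import requests
import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

doi = "10.5555/12345678"  # hypothetical example DOI

# OpenAlex resolves a work by the full DOI URL appended to the base endpoint
work = requests.get(config["publication_details_openalex_publications"]
                    + "https://doi.org/" + doi, timeout=10).json()

# Crossref takes the bare DOI and wraps its payload in a "message" object
message = requests.get(config["publication_details_crossref_publications"]
                       + doi, timeout=10).json().get("message", {})

print(work.get("display_name"), "|", message.get("title"))
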
200 changes: 142 additions & 58 deletions main.py
@@ -4,12 +4,13 @@
import uuid
# from objects import Person, Zenodo, Article, Dataset, Presentation, Poster, Software, Video, Image, Lesson, Institute, Funder, Publisher, Gesis, Cordis, Orcid, Gepris
from objects import Article, Organization, Person, Dataset, Project
from flask import Flask, render_template, request, make_response, session
from flask import Flask, render_template, request, make_response, session, jsonify
from flask_session import Session
import threading
from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications
from sources import dblp_publications, openalex_publications, zenodo, wikidata_publications, wikidata_researchers, openalex_researchers
from sources import resodate, oersi, ieee, eudat, openaire_products
from sources import dblp_researchers
from sources import crossref, semanticscholar
from sources import cordis, gesis, orcid, gepris, eulg, re3data, orkg

from chatbot import chatbot
@@ -31,6 +32,17 @@
app.config["SESSION_TYPE"] = "filesystem"
Session(app)

results = {
'publications': [],
'researchers': [],
'resources': [],
'organizations': [],
'events': [],
'fundings': [],
'others': [],
'timedout_sources': []
}

@app.route('/')
def index():
response = make_response(render_template('index.html'))
@@ -61,23 +73,14 @@ def search_results():
search_term = request.args.get('txtSearchTerm')
session['search-term'] = search_term

results = {
'publications': [],
'researchers': [],
'resources': [],
'organizations': [],
'events': [],
'fundings': [],
'others': [],
'timedout_sources': []
}
for k in results.keys(): results[k] = []
threads = []

# add all the sources to this list; for simplicity, use the exact module name
# ensure the method that executes the search in each module is named "search"
sources = [dblp_publications, openalex_publications, zenodo, wikidata_publications, resodate, oersi, ieee,
eudat, openaire_products, dblp_researchers, re3data, orkg]
# sources = [dblp_researchers]
eudat, openaire_products, re3data, orkg, openalex_researchers]
# sources = [openalex_publications]
for source in sources:
t = threading.Thread(target=source.search, args=(search_term, results,))
t.start()
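
Each source module exposes search(search_term, results) and runs on its own thread, all writing into the shared results dict; the join that bounds the wait sits in the collapsed part of this hunk. A minimal sketch of the same fan-out/join pattern (the timeout value and the straggler reporting are assumptions):

import threading

def fan_out_search(search_term: str, sources: list, results: dict, timeout: float = 10.0):
    # one worker per source module; every module writes into the shared results dict
    threads = [threading.Thread(target=s.search, args=(search_term, results)) for s in sources]
    for t in threads:
        t.start()
    # bound how long a slow source can block the page; record any stragglers
    for t in threads:
        t.join(timeout=timeout)
    results['timedout_sources'] = [s.__name__ for s, t in zip(sources, threads) if t.is_alive()]
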
@@ -97,24 +100,28 @@ def search_results():
# store the search results in the session
session['search-results'] = copy.deepcopy(results)

# Convert a UUID to a 32-character hexadecimal string
search_uuid = uuid.uuid4().hex
session['search_uuid'] = search_uuid

def send_search_results_to_chatbot(search_uuid: str):
print('request is about to start')
chatbot_server = utils.config['chatbot_server']
save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings']
request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}'
response = requests.post(request_url, json=json.dumps(results, default=vars))
response.raise_for_status()
print('request completed')

# create a new daemon thread
chatbot_thread = threading.Thread(target=send_search_results_to_chatbot, args=(search_uuid,), daemon=True)
# start the new thread
chatbot_thread.start()
# sleep(1)
# Chatbot - push search results to chatbot server for embeddings generation
if str(utils.config['chatbot_feature_enable']) == "True": # tolerate the YAML boolean True as well as the string "True"

# Convert a UUID to a 32-character hexadecimal string
search_uuid = uuid.uuid4().hex
session['search_uuid'] = search_uuid

def send_search_results_to_chatbot(search_uuid: str):
print('request is about to start')
chatbot_server = utils.config['chatbot_server']
save_docs_with_embeddings = utils.config['endpoint_save_docs_with_embeddings']
request_url = f'{chatbot_server}{save_docs_with_embeddings}/{search_uuid}'
response = requests.post(request_url, json=json.dumps(results, default=vars))
response.raise_for_status()
print('request completed')

# create a new daemon thread
chatbot_thread = threading.Thread(target=send_search_results_to_chatbot, args=(search_uuid,), daemon=True)
# start the new thread
chatbot_thread.start()
# sleep(1)


# on the first page load, only push top 20 records in each category
@@ -171,19 +178,23 @@ def load_more_researchers():

@app.route('/are-embeddings-generated', methods=['GET'])
def are_embeddings_generated():
print('are_embeddings_generated')
uuid = session['search_uuid']
chatbot_server = utils.config['chatbot_server']
are_embeddings_generated = utils.config['endpoint_are_embeddings_generated']
request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}"
headers = {
'Content-Type': 'application/json'
}
response = requests.request("GET", request_url, headers=headers)
json_response = response.json()
print('json_response:', json_response)
return str(json_response['file_exists'])

# Check the embeddings readiness only if the chatbot feature is enabled; otherwise report them as ready
if str(utils.config['chatbot_feature_enable']) == "True":
print('are_embeddings_generated')
uuid = session['search_uuid']
chatbot_server = utils.config['chatbot_server']
are_embeddings_generated = utils.config['endpoint_are_embeddings_generated']
request_url = f"{chatbot_server}{are_embeddings_generated}/{uuid}"
headers = {
'Content-Type': 'application/json'
}
response = requests.request("GET", request_url, headers=headers)
json_response = response.json()
print('json_response:', json_response)
return str(json_response['file_exists'])
else:
return str(True)
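
The page presumably polls this endpoint until it answers "True"; a rough client-side sketch (server address and polling cadence are assumptions):

import time
import requests

def wait_for_embeddings(base_url="http://127.0.0.1:5000", interval=2.0, attempts=30):
    # a Session keeps the Flask session cookie that carries search_uuid
    session = requests.Session()
    for _ in range(attempts):
        if session.get(f"{base_url}/are-embeddings-generated").text == "True":
            return True
        time.sleep(interval)
    return False
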

@app.route('/get-chatbot-answer', methods=['GET'])
def get_chatbot_answer():
@@ -215,25 +226,78 @@ def get_chatbot_answer():
# return response


from jinja2.filters import FILTERS
import json
def format_digital_obj_url(value):
sources_list = []
for source in value.source:
source_dict = {}
source_dict['doi'] = value.identifier
source_dict['sname'] = source.name
source_dict['sid'] = source.identifier
sources_list.append(source_dict)
return json.dumps(sources_list)
FILTERS["format_digital_obj_url"] = format_digital_obj_url

def format_authors_for_citations(value):
# join the author names with " and " and terminate the list with a period
return " and ".join(author.name for author in value) + "."
FILTERS["format_authors_for_citations"] = format_authors_for_citations

import re
def regex_replace(s, find, replace):
"""A non-optimal implementation of a regex filter"""
return re.sub(find, replace, s)
FILTERS["regex_replace"] = regex_replace

from urllib.parse import unquote
import ast

@app.route('/publication-details/<path:sources>', methods=['GET'])
@utils.timeit
def publication_details(sources):

sources = unquote(sources)
sources = ast.literal_eval(sources)
for source in sources:
doi = source['doi']

publication = openalex_publications.get_publication(doi="https://doi.org/"+doi)
response = make_response(render_template('publication-details.html', publication=publication))

@app.route('/publication-details/<string:doi>', methods=['POST', 'GET'])
@utils.timeit
def publication_details(doi):
doi = request.args.get('doi', '').replace('-.-', '/')
print(doi)
print("response:", response)
return response

response = make_response(render_template('publication-details.html'))
@app.route('/publication-details-references/<path:doi>', methods=['GET'])
@utils.timeit
def publication_details_references(doi):
print("doi:", doi)

publication = crossref.get_publication(doi=doi)
response = make_response(render_template('partials/publication-details/references.html', publication=publication))

# Set search-session cookie to the session cookie value of the first visit
if request.cookies.get('search-session') is None:
if request.cookies.get('session') is None:
response.set_cookie('search-session', str(uuid.uuid4()))
else:
response.set_cookie('search-session', request.cookies['session'])
print("response:", response)
return response

@app.route('/publication-details-recommendations/<path:doi>', methods=['GET'])
@utils.timeit
def publication_details_recommendations(doi):
print("DOI:", doi)
publications = semanticscholar.get_recommendations_for_publication(doi=doi)
response = make_response(render_template('partials/publication-details/recommendations.html', publications=publications))
print("response:", response)
return response

@app.route('/publication-details-citations/<path:doi>', methods=['GET'])
@utils.timeit
def publication_details_citations(doi):
print("DOI:", doi)
publications = semanticscholar.get_citations_for_publication(doi=doi)
response = make_response(render_template('partials/publication-details/citations.html', publications=publications))
print("response:", response)
return response

@app.route('/resource-details')
def resource_details():
@@ -249,9 +313,16 @@ def resource_details():
return response


@app.route('/researcher-details')
def researcher_details():
response = make_response(render_template('researcher-details.html'))
@app.route('/researcher-details/<string:index>', methods=['GET'])
def researcher_details(index):
# index = json.loads(index)
# for result in results['researchers']:
# if result.source[0].identifier.replace("https://openalex.org/", "") == index[0]['sid']:
# researcher = result
# break
# logger.info(f'Found researcher {researcher}')
researcher = openalex_researchers.get_researcher_details(index)
response = make_response(render_template('researcher-details.html',researcher=researcher))

# Set search-session cookie to the session cookie value of the first visit
if request.cookies.get('search-session') is None:
@@ -262,6 +333,19 @@ def researcher_details():

return response

@app.route('/researcher-banner/<string:index>', methods=['GET'])
def researcher_banner(index):
# logger.info(f'Fetching details for researcher with index {index}')
for result in results['researchers']:
if result.list_index == index:
researcher = result
break
# logger.info(f'Found researcher {researcher}')
researcher = openalex_researchers.get_researcher_banner(researcher)
if researcher.banner == "":
return jsonify()
return jsonify(imageUrl = f'data:image/jpeg;base64,{researcher.banner}')
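
The banner endpoint answers with either an empty JSON object or a JPEG data URI; a hypothetical consumer (dev-server address and list_index value are invented):

import requests

resp = requests.get("http://127.0.0.1:5000/researcher-banner/3")  # assumed dev server and index
banner = resp.json().get("imageUrl")
if banner:
    # "data:image/jpeg;base64,<...>" can be assigned straight to an <img> src attribute
    print(banner[:40], "...")
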


@app.route('/organization-details/<string:organization_id>/<string:organization_name>', methods=['GET'])
def organization_details(organization_id, organization_name):
19 changes: 12 additions & 7 deletions objects.py
@@ -60,18 +60,12 @@ class Person(thing):
Organization.parentOrganization = Organization()


@dataclass
class Author(Person):
# orcid: str = "" # we should not have this attribute; orcid should be kept in
works_count: str = ""
cited_by_count: str = ""

@dataclass
class CreativeWork(thing):
abstract: str = ""
alternativeHeadline: str = ""
author: List[Union[Organization, Person]] = field(default_factory=list)
citation: str = "" # this should actually reference to articles
citation: List["CreativeWork"] = field(default_factory=list) # this list will have "CreativeWork" objects
countryOfOrigin: str = ""
creativeWorkStatus: str = ""
dateCreated: str = ""
@@ -107,6 +101,17 @@ class Dataset(CreativeWork):
distribution: str = ""
issn: str = ""

@dataclass
class Author(Person):
orcid: str = "" # we should not have this attribute; orcid should be kept in
works_count: str = ""
about: str = ""
banner: str = ""
cited_by_count: str = ""
url: str = ""
researchAreas: List[str] = field(default_factory=list)
works: List[Union[Article, Dataset]] = field(default_factory=list)
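
A rough sketch of how the widened Author object might be filled and serialized with the same json.dumps(..., default=vars) call main.py uses for the chatbot payload (all field values are invented):

import json
from objects import Article, Author

author = Author()  # assumes the inherited Person/thing fields keep their defaults
author.works_count = "42"
author.cited_by_count = "1337"
author.researchAreas = ["knowledge graphs", "information retrieval"]

work = Article()
work.identifier = "10.5555/12345678"  # hypothetical DOI
author.works.append(work)

print(json.dumps(author, default=vars))  # nested dataclasses serialize via vars()
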

# The 'Project' type is a new addition to schema.org; as of now, it has no defined properties
@dataclass
class Project(Organization):
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
# gradio~=3.9.1
lxml==5.1.0
flask==2.3.2
extruct~=0.14.0
rdflib~=6.2.0
@@ -17,3 +18,4 @@ xmltodict
dateparser>=1.2.0
Flask-Session==0.5.0
rank_bm25==0.2.2
openai==1.35.3
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
45 changes: 45 additions & 0 deletions sources/crossref.py
@@ -0,0 +1,45 @@
import requests
from objects import thing, Article, Author
import logging
import utils
from sources import data_retriever
import traceback


# logging.config.fileConfig(os.getenv('LOGGING_FILE_CONFIG', './logging.conf'))
logger = logging.getLogger('nfdi_search_engine')

@utils.timeit
def get_publication(doi: str):

source = "CROSSREF Publication"

try:
search_result = data_retriever.retrieve_single_object(source=source,
base_url=utils.config["publication_details_crossref_publications"],
doi=doi)

search_result = search_result.get('message', {})

publication = Article()

title = search_result.get("title", [""])
publication.name = utils.remove_html_tags(title[0]) if title else ""
publication.identifier = search_result.get("DOI", "").replace("https://doi.org/", "")
publication.abstract = utils.remove_html_tags(search_result.get("abstract", ""))

references = search_result.get("reference", [])
for reference in references:
referenced_publication = Article()
referenced_publication.text = reference.get("unstructured", "")
referenced_publication.identifier = reference.get("DOI", "")
publication.citation.append(referenced_publication)

return publication

except requests.exceptions.Timeout as ex:
logger.error(f'Timed out Exception: {str(ex)}')

except Exception as ex:
logger.error(f'Exception: {str(ex)}')
logger.error(traceback.format_exc())
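
A short usage sketch for the new module (assumed DOI; get_publication implicitly returns None when an exception is caught, so the caller should guard):

from sources import crossref

publication = crossref.get_publication(doi="10.5555/12345678")  # hypothetical DOI
if publication:
    print(publication.name)
    for reference in publication.citation:
        # each entry is an Article carrying the unstructured text and, when present, a DOI
        print("-", reference.identifier or reference.text)
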