Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Control list filters #323

Open
wants to merge 43 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
7034e27
Revert "Add more details about the lemmatizer at the new corpus phase…
Juliettejns Mar 21, 2024
6e202f0
ajout filtres de correction token
Juliettejns Jun 13, 2024
49016d0
Revert "ajout filtres de correction token"
Juliettejns Jun 13, 2024
4308ecc
ajout fonction filtres tokens à corriger
Juliettejns Jun 13, 2024
09768ec
Add more details about the lemmatizer at the new corpus phase. (#317)
PonteIneptique Mar 20, 2024
0acb09d
ajout filtres modifications de tokens + début affichage + templates c…
Juliettejns Jun 24, 2024
04d080c
suppression anciennes modifications
Juliettejns Jun 24, 2024
61f5957
ajout filtres token invalid + lien corpus
Juliettejns Jun 25, 2024
9d5844f
correction Ignore values des listes de menus dans Control Lists
Juliettejns Jun 26, 2024
f5517e9
correction tests corpus_init//fonction get_unallowed
Juliettejns Jun 27, 2024
b5a2355
correction erreurs tests - bug new corpus
Juliettejns Jun 28, 2024
eaf0822
suppression commentaire test
Juliettejns Jun 28, 2024
797d484
ajouts premiers jets tests
Juliettejns Jul 1, 2024
006fca8
correction test filter update
Juliettejns Jul 2, 2024
2b11ebc
correction test registration corpus filter
Juliettejns Jul 2, 2024
ee34646
ajout test edit token with filter
Juliettejns Jul 2, 2024
0fe9659
modif aggrandissement varchar models corpus + print logs
Juliettejns Jul 9, 2024
f97e432
test ajout création user pour CL filter
Juliettejns Jul 12, 2024
b8bbd51
test bug control filters - ajout users"
Juliettejns Jul 12, 2024
89e9aae
tests CL - modification find element by ID > NAME
Juliettejns Aug 27, 2024
666ff6d
find element by ID>NAME
Juliettejns Aug 27, 2024
56bac05
Correct tests and clean up the way regex are applied (#329)
PonteIneptique Aug 27, 2024
a2fb107
changement filtres CLS ControlListUser>controlList
Juliettejns Aug 28, 2024
8e0ef97
suppression ajout count
Juliettejns Aug 28, 2024
aa1fb7d
modifications corpus.id => self.id + get_unallowed attributes
Juliettejns Aug 28, 2024
dcd28a6
Adding tests back to control list for changing filter
PonteIneptique Sep 3, 2024
41039bd
ajout test base filtre
Juliettejns Sep 3, 2024
1c9e66c
Creating combinatory tests
PonteIneptique Sep 3, 2024
dbf15fa
ajout filtre test combinaison assert + modif filtre ponctuation
Juliettejns Sep 3, 2024
775f7f4
ajout filtre none
Juliettejns Sep 3, 2024
f76e9a1
Fix a condition on lemma
PonteIneptique Sep 3, 2024
f4e925c
Better message
PonteIneptique Sep 3, 2024
06332f7
modif test regex ajout condition spé Sans test
Juliettejns Sep 9, 2024
d482cf3
modif filtre metadata sur form et non lemma + correction unallowed
Juliettejns Sep 11, 2024
1f4b80a
modif test filtres avec metadata
Juliettejns Sep 11, 2024
e00e28f
suppression user_id des appels de get_unallowed
Juliettejns Sep 11, 2024
71a920d
ajout choix unallowed sqlite ou posgtresé
Juliettejns Sep 11, 2024
11caf39
ajout diff sqlite/postgres pour filtres get_unallowed
Juliettejns Sep 11, 2024
668fd90
déplacement logging
Juliettejns Sep 11, 2024
fbabbcf
deplacement logging
Juliettejns Sep 12, 2024
c6c0a15
Change the way the control list filter view is shown
PonteIneptique Sep 17, 2024
db60b6a
modif metadata validity + tests + presentation filtres CL dans inform…
Juliettejns Sep 17, 2024
360a560
Delete tests/test_selenium/download_temp/wauchier.xml
Juliettejns Sep 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion app/control_lists/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


from app.main.views.utils import render_template_with_nav_info
from app.models import ControlLists, AllowedLemma, WordToken, User, PublicationStatus, CorpusCustomDictionary
from app.models import ControlLists, ControlListsUser, AllowedLemma, WordToken, User, PublicationStatus, CorpusCustomDictionary
from app import db, email
from ..utils import PyrrhaError
from ..utils.forms import strip_or_none
Expand Down Expand Up @@ -374,3 +374,36 @@ def information_edit(control_list_id, control_list):
def information_read(control_list_id):
    """ Render the read-only information page of a control list

    :param control_list_id: ID of the control list to display
    :return: Rendered ``control_lists/information_read.html`` template
    """
    # get_linked_or_404 returns a (control list, ownership flag) pair; the flag is not needed here
    linked_control_list, _ = ControlLists.get_linked_or_404(
        control_list_id=control_list_id,
        user=current_user
    )
    return render_template_with_nav_info(
        'control_lists/information_read.html',
        control_list=linked_control_list
    )


@control_lists_bp.route("/controls/<int:control_list_id>/ignore_terms", methods=["POST", "GET"])
@login_required
def ignore_terms_filter(control_list_id):
    """ Display and update the "ignore" filters of a control list

    On GET, the filter form is rendered with the current state of the control list.
    On POST, each of the four filters (punct, numeral, ignore, metadata) is switched on
    when its value was submitted with the form and switched off otherwise; the change is
    committed and the same page is rendered again with a success message.

    :param control_list_id: ID of the control list whose filters are shown or edited
    :return: Rendered ``control_lists/ignore_filter.html`` template
    """
    # NOTE(review): only @login_required protects this view. Unlike information_read, it does
    # not go through ControlLists.get_linked_or_404, so any logged-in user can change the
    # filters of any control list. Confirm whether this should be restricted to owners.
    current_control_list = ControlLists.query.filter_by(id=control_list_id).first_or_404()

    if request.method == "POST":
        # Unchecked boxes are absent from the form, hence .get() returning None for them
        submitted = {
            request.form.get(field)
            for field in ("punct", "numeral", "ignore", "metadata")
        }

        current_control_list.filter_punct = 'punct' in submitted
        current_control_list.filter_metadata = 'metadata' in submitted
        current_control_list.filter_numeral = 'numeral' in submitted
        current_control_list.filter_ignore = 'ignore' in submitted
        db.session.add(current_control_list)
        db.session.commit()

        flash('The filters have been updated.', 'success')

    return render_template_with_nav_info(
        'control_lists/ignore_filter.html',
        control_list_id=control_list_id,
        current_control_list=current_control_list
    )
26 changes: 24 additions & 2 deletions app/main/views/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sqlalchemy.exc
from sqlalchemy import func, distinct, text
from typing import List
import logging


from app import db
Expand Down Expand Up @@ -36,12 +37,14 @@ def _get_available():
lists[cl.str_public].append(cl)
return lists


@main.route('/corpus/new', methods=["POST", "GET"])
@login_required
def corpus_new():
""" Register a new corpus
"""
logging.basicConfig(filename='./pyrrha_corpus_creation.log', level=logging.DEBUG,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

La basic config devrait avoir lieu ailleurs je pense... Par exemple, dans create_app

format='%(asctime)s %(levelname)s %(name)s %(message)s')
logger = logging.getLogger(__name__)
lemmatizers = current_app.config.get("LEMMATIZERS", [])

def normal_view():
Expand Down Expand Up @@ -93,8 +96,9 @@ def error():
tokens = read_input_tokens(request.form.get("tsv"))
try:
control_list = ControlLists.query.get_or_404(request.form.get("control_list_select"))
except Exception:
except Exception as e:
flash("This control list does not exist", category="error")
logger.error(e)
return error()
form_kwargs.update({"word_tokens_dict": tokens,
"control_list": control_list})
Expand All @@ -110,17 +114,32 @@ def error():
form_kwargs.update({"word_tokens_dict": tokens, "allowed_lemma": allowed_lemma,
"allowed_POS": allowed_POS, "allowed_morph": allowed_morph})

list_filter = []
list_filter.append(request.form.get("punct"))
list_filter.append(request.form.get("numeral"))
list_filter.append(request.form.get("ignore"))
list_filter.append(request.form.get("metadata"))
list_filter = [flt for flt in list_filter if flt]

try:
corpus: Corpus = Corpus.create(**form_kwargs)
db.session.add(CorpusUser(corpus=corpus, user=current_user, is_owner=True))
# Add a link to the control list
ControlLists.link(corpus.control_lists_id, current_user.id, is_owner=cl_owner)
db.session.commit()
current_controlList = ControlLists.query.filter_by(**{"id":corpus.control_lists_id}).first_or_404()
current_controlList.filter_punct = 'punct' in list_filter
current_controlList.filter_metadata = 'metadata' in list_filter
current_controlList.filter_numeral = 'numeral' in list_filter
current_controlList.filter_ignore = 'ignore' in list_filter
db.session.commit()
flash("New corpus registered", category="success")
except (sqlalchemy.exc.StatementError, sqlalchemy.exc.IntegrityError) as e:
print(e)
db.session.rollback()
flash("The corpus cannot be registered. Check your data", category="error")
flash(str(e.orig).lower())
logger.error(e)
if db.session.get_bind().dialect.name == "postgresql":
unique_constraint = 'duplicate key value violates unique constraint "corpus_name_key"'
else:
Expand All @@ -133,6 +152,7 @@ def error():
db.session.rollback()
flash("At least one line of your corpus is missing a token/form. Check line %s " % exc.line,
category="error")
logger.error(exc)
return error()
except NoTokensInput:
db.session.rollback()
Expand All @@ -143,8 +163,10 @@ def error():
flash(exception, category="error")
return error()
except Exception as e:
print(e)
db.session.rollback()
flash("The corpus cannot be registered. Check your data", category="error")
logger.error(e)
return error()
return redirect(url_for(".corpus_get", corpus_id=corpus.id))

Expand Down
4 changes: 3 additions & 1 deletion app/main/views/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ def tokens_correct_unallowed(corpus_id, allowed_type):
:param allowed_type: Type of allowed value to check against (lemma, POS, morph)
"""
corpus = Corpus.query.filter_by(**{"id": corpus_id}).first()
user_id = current_user.id
tokens = corpus\
.get_unallowed(allowed_type)\
.get_unallowed(user_id, corpus_id, allowed_type)\
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
.paginate(
page=int_or(request.args.get("page"), 1),
per_page=int_or(request.args.get("limit"), current_app.config["PAGINATION_DEFAULT_TOKENS"])
Expand All @@ -67,6 +68,7 @@ def tokens_correct_unallowed(corpus_id, allowed_type):
)



@main.route('/corpus/<int:corpus_id>/tokens/changes/similar/<int:record_id>')
@login_required
@requires_corpus_access("corpus_id")
Expand Down
25 changes: 23 additions & 2 deletions app/models/control_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from collections import Counter
# PIP Packages
import unidecode
import regex as re
import yaml
from flask_sqlalchemy.query import Query as FlaskQuery
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.orm import backref
from sqlalchemy.orm import backref, Query
from sqlalchemy import literal, case
from werkzeug.exceptions import BadRequest
# APP Logic
Expand Down Expand Up @@ -43,6 +45,15 @@ class ControlLists(db.Model):
bibliography = db.Column(db.Text, nullable=True)
language = db.Column(db.String(10), nullable=True)
notes = db.Column(db.Text, nullable=True)
# Per-control-list switches, all off by default. When a switch is on, the matching
# re_filter_* pattern of this class is added to the patterns applied to tokens.
# Tokens made of a single punctuation sign
filter_punct = db.Column(db.Boolean, unique=False, default=False)
# Tokens made of digits only
filter_numeral = db.Column(db.Boolean, unique=False, default=False)
# Tokens ending with a bracketed [key:value] group
filter_metadata = db.Column(db.Boolean, unique=False, default=False)
# Tokens starting with [IGNORE]
filter_ignore = db.Column(db.Boolean, unique=False, default=False)

# Regular expressions associated with the filter_* switches. All of them are raw strings so
# that the backslashes reach the regex engine untouched.
# A bracketed group containing a colon, e.g. "[REF:1]", at the end of the value
re_filter_metadata = r'(\[[^\]]+:[^\]]*\]$)'
# A value starting with the literal marker "[IGNORE]"
re_filter_ignore = r'(^\[IGNORE\])'
# Exactly one character that is neither a word character nor a whitespace
re_filter_punct = r'(^[^\w\s]$)'
# One or more digits and nothing else
re_filter_numeral = r'(^\d+$)'

# For caching purposes, we record the last time these fields were edited
#last_lemma_edit = db.Column(db.DateTime, default=datetime.datetime.utcnow)
Expand Down Expand Up @@ -240,6 +251,7 @@ def has_list(self, allowed_type):
).exists()
).scalar()


@staticmethod
def add_default_lists(path=None):
""" Loads the default lists from the config folder
Expand All @@ -255,7 +267,7 @@ def add_default_lists(path=None):
print("[ControlLists] Adding %s " % data["name"])
cl = ControlLists(**data, public=PublicationStatus.public)
db.session.add(cl)
db.session.flush() # Get the AutoIncrement ID
db.session.flush() # Get the AutoIncrement ID
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
configs = [
("lemma.txt", AllowedLemma, read_input_lemma),
("POS.txt", AllowedPOS, read_input_POS),
Expand All @@ -281,10 +293,19 @@ class ControlListsUser(db.Model):
user_id = db.Column(db.Integer, db.ForeignKey(User.id), primary_key=True)
is_owner = db.Column(db.Boolean, default=False)


control = db.relationship("ControlLists", backref=backref("control_lists_user", cascade="all, delete-orphan"))
user = db.relationship(User, backref=backref("control_lists_user", cascade="all, delete-orphan"))



@classmethod
def retrieve(cls, user_id: int, control_list_id: int) -> FlaskQuery:
    """ Build the query selecting the link between a user and a control list

    :param user_id: ID of the user
    :param control_list_id: ID of the control list
    :return: Query filtered on both IDs (not executed)
    """
    same_user = cls.user_id == user_id
    same_control_list = cls.control_lists_id == control_list_id
    return cls.query.filter(db.and_(same_user, same_control_list))

class AllowedLemma(db.Model):
""" An allowed lemma is a lemma that is accepted

Expand Down
79 changes: 62 additions & 17 deletions app/models/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# PIP Packages
import unidecode
import sqlalchemy.exc
import regex as re
from sqlalchemy.ext.associationproxy import association_proxy
from sqlalchemy.orm import backref
from sqlalchemy import func, literal, not_, or_, and_
Expand All @@ -23,7 +24,7 @@

# Models
from .user import User
from .control_lists import ControlLists, AllowedPOS, AllowedMorph, AllowedLemma, PublicationStatus
from .control_lists import ControlLists, ControlListsUser, AllowedPOS, AllowedMorph, AllowedLemma, PublicationStatus


from collections import namedtuple
Expand Down Expand Up @@ -297,7 +298,7 @@ def get_allowed_values(self, allowed_type="lemma", label=None, order_by="label",
).order_by(order_by)
return db.session.query(cls).filter(cls.control_list == self.control_lists_id).order_by(order_by)

def get_unallowed(self, allowed_type="lemma"):
def get_unallowed(self, user_id, corpus_id, allowed_type="lemma"):
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
""" Search for WordToken that would not comply with Allowed Values (in AllowedLemma,
AllowedPOS, AllowedMorph) nor with a corpus custom dictionary

Expand Down Expand Up @@ -326,12 +327,33 @@ def get_unallowed(self, allowed_type="lemma"):
CorpusCustomDictionary.category == allowed_type,
CorpusCustomDictionary.label == prop
)

list_darguments = [
WordToken.corpus == self.id,
not_(allowed.exists()),
not_(custom_dict.exists())
]

current_corpus = Corpus.query.filter_by(**{"id":corpus_id}).first_or_404()
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
current_controlList = ControlLists.query.filter_by(
Juliettejns marked this conversation as resolved.
Show resolved Hide resolved
**{"id":current_corpus.control_lists_id}).first_or_404()

# Collect the pattern of every filter switched on for the control list of the corpus.
# current_controlList comes from first_or_404() above, so it is never None here.
regex_liste = [
    pattern
    for enabled, pattern in (
        (current_controlList.filter_metadata, ControlLists.re_filter_metadata),
        (current_controlList.filter_ignore, ControlLists.re_filter_ignore),
        (current_controlList.filter_punct, ControlLists.re_filter_punct),
        (current_controlList.filter_numeral, ControlLists.re_filter_numeral),
    )
    if enabled
]

if regex_liste:
    # The patterns are alternatives: joining them with "|" ignores a form as soon as it
    # matches one filter (plain concatenation would require all of them in sequence).
    # not_(): a form matching a filter is ignored, hence removed from the unallowed tokens,
    # in line with WordToken.is_valid where a matching value is not flagged as invalid.
    # NOTE(review): "~" is PostgreSQL's regex-match operator; this clause presumably fails
    # on other dialects such as SQLite - confirm and branch on the dialect name if needed.
    list_darguments.append(
        not_(WordToken.form.op('~')("|".join(regex_liste)))
    )

return db.session.query(WordToken).filter(
db.and_(
WordToken.corpus == self.id,
not_(allowed.exists()),
not_(custom_dict.exists())
)
db.and_(*list_darguments)
).order_by(WordToken.order_id)

@property
Expand Down Expand Up @@ -437,6 +459,8 @@ def create(
)
db.session.add(c)
db.session.commit()


except (sqlalchemy.exc.StatementError, sqlalchemy.exc.IntegrityError) as e:
db.session.rollback()
raise e
Expand Down Expand Up @@ -1097,13 +1121,35 @@ def is_valid(lemma, POS, morph, corpus):
}

allowed_column = corpus.displayed_columns_by_name

if lemma is not None \
and "lemma" in allowed_column \
and allowed_lemma.count() > 0 \
and corpus.get_allowed_values("lemma", label=lemma).count() == 0:
if not corpus.has_custom_dictionary_value("lemma", lemma):
statuses["lemma"] = False
# A lemma is only questioned when it is displayed and absent from the allowed lemmas
if lemma and "lemma" in allowed_column and corpus.get_allowed_values("lemma", label=lemma).count() == 0:
    current_controlList = ControlLists.query.filter_by(id=corpus.control_lists_id).first_or_404()

    # Patterns of the filters switched on for the control list of the corpus
    regex_liste = [
        pattern
        for enabled, pattern in (
            (current_controlList.filter_metadata, ControlLists.re_filter_metadata),
            (current_controlList.filter_ignore, ControlLists.re_filter_ignore),
            (current_controlList.filter_punct, ControlLists.re_filter_punct),
            (current_controlList.filter_numeral, ControlLists.re_filter_numeral),
        )
        if enabled
    ]

    # A lemma matching at least one active filter is ignored, i.e. never flagged as invalid
    ignored_by_regex = any(re.match(regex, lemma) is not None for regex in regex_liste)

    # The allowed-values lookup was already done in the enclosing condition, so only the
    # custom dictionary remains to be checked
    if not ignored_by_regex and corpus.has_custom_dictionary_value("lemma", lemma) is False:
        statuses["lemma"] = False

if POS is not None \
and "POS" in allowed_column \
Expand All @@ -1118,6 +1164,7 @@ def is_valid(lemma, POS, morph, corpus):
and corpus.get_allowed_values("morph", label=morph).count() == 0:
if not corpus.has_custom_dictionary_value("morph", morph):
statuses["morph"] = False

return statuses

@staticmethod
Expand Down Expand Up @@ -1305,7 +1352,6 @@ def update(user_id, corpus_id, token_id, lemma=None, POS=None, morph=None):
error = WordToken.NothingChangedError("No value where changed")
error.msg = "No value where changed"
raise error

# Check if values are correct regarding allowed values
validity = WordToken.is_valid(lemma=lemma, POS=POS, morph=morph, corpus=corpus)
if False in list(validity.values()):
Expand All @@ -1317,7 +1363,6 @@ def update(user_id, corpus_id, token_id, lemma=None, POS=None, morph=None):
error.statuses = validity
error.invalid_columns = [key for key in validity.keys() if validity[key] is False]
raise error

# Updating
if not lemma:
lemma = token.lemma
Expand Down Expand Up @@ -1442,7 +1487,7 @@ def get_nearly_similar_to(token, mode):
WordToken.id != token.id,
*filtering
)
)
)


class TokenHistory(db.Model):
Expand Down
Loading
Loading