-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improved performance for email address detection (#132)
* Add new email address test cases
* Improve email address detection by 2x by default, 4x without strict matching
- Loading branch information
1 parent
c2d32c0
commit c489ca8
Showing 4 changed files with 95 additions and 6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
import ai.philterd.phileas.model.enums.FilterType; | ||
import ai.philterd.phileas.model.filter.FilterConfiguration; | ||
import ai.philterd.phileas.model.objects.FilterResult; | ||
import ai.philterd.phileas.model.policy.Policy; | ||
import ai.philterd.phileas.model.policy.filters.strategies.rules.EmailAddressFilterStrategy; | ||
import ai.philterd.phileas.model.services.AlertService; | ||
import ai.philterd.phileas.services.anonymization.AlphanumericAnonymizationService; | ||
|
@@ -35,7 +36,7 @@ public class EmailAddressFilterTest extends AbstractFilterTest { | |
private final AlertService alertService = Mockito.mock(AlertService.class); | ||
|
||
@Test | ||
public void filterEmail() throws Exception { | ||
public void filterEmailStrict() throws Exception { | ||
|
||
final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder() | ||
.withStrategies(List.of(new EmailAddressFilterStrategy())) | ||
|
@@ -44,13 +45,84 @@ public void filterEmail() throws Exception { | |
.withWindowSize(windowSize) | ||
.build(); | ||
|
||
final EmailAddressFilter filter = new EmailAddressFilter(filterConfiguration); | ||
filterEmails(filterConfiguration, true); | ||
|
||
final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "my email is [email protected].", attributes); | ||
} | ||
|
||
// Relaxed-matching variant: builds a filter configuration (EmailAddressFilterStrategy, the mocked | ||
// AlertService, an alphanumeric anonymization service and windowSize) and runs the shared | ||
// filterEmails assertions with onlyStrictMatches set to false. | ||
@Test | ||
public void filterEmailRelaxed() throws Exception { | ||
|
||
final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder() | ||
.withStrategies(List.of(new EmailAddressFilterStrategy())) | ||
.withAlertService(alertService) | ||
.withAnonymizationService(new AlphanumericAnonymizationService(new LocalAnonymizationCacheService())) | ||
.withWindowSize(windowSize) | ||
.build(); | ||
|
||
// false => the filter is constructed with strict matching disabled. | ||
filterEmails(filterConfiguration, false); | ||
|
||
} | ||
|
||
// Shared assertions for the strict and relaxed tests: constructs an EmailAddressFilter from the | ||
// given configuration and onlyStrictMatches flag, then checks how many spans each input yields. | ||
// NOTE(review): literals shown as "[email protected]" look like addresses masked by the page this | ||
// text was captured from; the real values are not visible here -- confirm against the repository. | ||
private void filterEmails(FilterConfiguration filterConfiguration, boolean onlyStrictMatches) throws Exception { | ||
|
||
final String cxt = "context"; | ||
final String doc = "documentid"; | ||
final EmailAddressFilter filter = new EmailAddressFilter(filterConfiguration, onlyStrictMatches); | ||
final Policy policy = getPolicy(); | ||
|
||
// Address embedded in a sentence: expects exactly one EMAIL_ADDRESS span; checkSpan is given | ||
// 12 and 25 (presumably the span's start and end character offsets -- confirm in AbstractFilterTest). | ||
final FilterResult filterResult = filter.filter(policy, cxt, doc, PIECE, "my email is [email protected].", attributes); | ||
Assertions.assertEquals(1, filterResult.getSpans().size()); | ||
Assertions.assertTrue(checkSpan(filterResult.getSpans().get(0), 12, 25, FilterType.EMAIL_ADDRESS)); | ||
Assertions.assertEquals("[email protected]", filterResult.getSpans().get(0).getText()); | ||
|
||
// 👇 cases adapted from https://www.tumblr.com/codefool/15288874550/list-of-valid-and-invalid-email-addresses | ||
|
||
// valid email addresses | ||
// Each input in this group is expected to produce exactly one span, regardless of the flag. | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "very.unusual.“@”[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "very.“(),:;<>[]”.VERY.“very@\\\\ \"very”[email protected]", attributes).getSpans().size()); | ||
//Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "“email”@example.com", attributes).getSpans().size()); // todo include quotes | ||
//Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "much.“more\\ unusual”@example.com", attributes).getSpans().size()); // todo include quotes | ||
|
||
// valid email addresses only detected with strict matching | ||
// Expected span count follows the flag: 1 when onlyStrictMatches is true, otherwise 0. | ||
Assertions.assertEquals(onlyStrictMatches ? 1 : 0, filter.filter(policy, cxt, doc, PIECE, "email@[123.123.123.123]", attributes).getSpans().size()); | ||
|
||
// invalid email addresses | ||
// Each input in this group is expected to produce no spans, regardless of the flag. | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "plainaddress", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "#@%^%#$@#$@#.com", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "@example.com", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "email.example.com", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "あいうえお@example.com", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "email@example", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "“(),:;<>[\\]@example.com", attributes).getSpans().size()); | ||
//Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); // todo detect invalid TLD | ||
//Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); // todo detect invalid TLD | ||
|
||
// invalid email addresses only rejected with strict matching | ||
// Expected span count is 0 when onlyStrictMatches is true, otherwise 1. | ||
Assertions.assertEquals(onlyStrictMatches ? 0 : 1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(onlyStrictMatches ? 0 : 1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
|
||
// valid partial matches against invalid email addresses | ||
// Each input in this group is expected to produce exactly one span, regardless of the flag. | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "Joe Smith <[email protected]>", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "email@[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected] (Joe Smith)", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "just\"not\"[email protected]", attributes).getSpans().size()); | ||
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "this\\ is\"really\"not\\[email protected]", attributes).getSpans().size()); | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters