Skip to content

Commit

Permalink
Improved performance for email address detection (#132)
Browse files Browse the repository at this point in the history
* Add new email address test cases

* Improve email address detection performance: 2x faster by default, 4x faster without strict matching
  • Loading branch information
RobDickinson authored Aug 25, 2024
1 parent c2d32c0 commit c489ca8
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ public List<Filter> getFiltersForPolicy(final Policy policy, final Map<String, M
.withWindowSize(phileasConfiguration.spanWindowSize())
.build();

final Filter filter = new EmailAddressFilter(filterConfiguration);
final boolean isStrict = policy.getIdentifiers().getEmailAddress().isOnlyStrictMatches();

final Filter filter = new EmailAddressFilter(filterConfiguration, isStrict);
enabledFilters.add(filter);
filterCache.get(policy.getName()).put(FilterType.EMAIL_ADDRESS, filter);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@

public class EmailAddressFilter extends RegexFilter {

public EmailAddressFilter(FilterConfiguration filterConfiguration) {
public EmailAddressFilter(FilterConfiguration filterConfiguration, boolean onlyStrictMatches) {
super(FilterType.EMAIL_ADDRESS, filterConfiguration);

final Pattern emailAddressPattern = Pattern.compile("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", Pattern.CASE_INSENSITIVE);
final Pattern emailAddressPattern = onlyStrictMatches
? Pattern.compile("\\b(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\b])", Pattern.CASE_INSENSITIVE)
: Pattern.compile("\\b[\\w.-]+?@(?:([a-zA-Z\\d\\-])+?\\.)+(?:[a-zA-Z\\d]{2,4})+\\b");

final FilterPattern email1 = new FilterPattern.FilterPatternBuilder(emailAddressPattern, 0.90).build();

this.contextualTerms = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import ai.philterd.phileas.model.enums.FilterType;
import ai.philterd.phileas.model.filter.FilterConfiguration;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.policy.filters.strategies.rules.EmailAddressFilterStrategy;
import ai.philterd.phileas.model.services.AlertService;
import ai.philterd.phileas.services.anonymization.AlphanumericAnonymizationService;
Expand All @@ -35,7 +36,7 @@ public class EmailAddressFilterTest extends AbstractFilterTest {
private final AlertService alertService = Mockito.mock(AlertService.class);

@Test
public void filterEmail() throws Exception {
public void filterEmailStrict() throws Exception {

final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder()
.withStrategies(List.of(new EmailAddressFilterStrategy()))
Expand All @@ -44,13 +45,84 @@ public void filterEmail() throws Exception {
.withWindowSize(windowSize)
.build();

final EmailAddressFilter filter = new EmailAddressFilter(filterConfiguration);
filterEmails(filterConfiguration, true);

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "my email is [email protected].", attributes);
}

/**
 * Runs the shared email address assertions with strict matching turned off.
 *
 * @throws Exception if the shared assertion helper throws
 */
@Test
public void filterEmailRelaxed() throws Exception {

    // Relaxed mode: the filter is built with onlyStrictMatches == false.
    final boolean onlyStrictMatches = false;

    final AlphanumericAnonymizationService anonymizationService =
            new AlphanumericAnonymizationService(new LocalAnonymizationCacheService());

    final FilterConfiguration relaxedConfiguration =
            new FilterConfiguration.FilterConfigurationBuilder()
                    .withStrategies(List.of(new EmailAddressFilterStrategy()))
                    .withAlertService(alertService)
                    .withAnonymizationService(anonymizationService)
                    .withWindowSize(windowSize)
                    .build();

    filterEmails(relaxedConfiguration, onlyStrictMatches);

}

/**
 * Shared assertions for {@link EmailAddressFilter}, intended to be run once per matching mode.
 *
 * Builds an EmailAddressFilter from the supplied configuration and strictness flag, then
 * filters a series of inputs and asserts how many spans each one produces. Most cases expect
 * the same count in both modes; a few expect a count that depends on {@code onlyStrictMatches}.
 *
 * NOTE(review): many inputs below are the identical literal "[email protected]". That looks like
 * an email-obfuscation placeholder substituted for the real addresses rather than the original
 * test data: the same literal is expected to yield 1 span in the "valid" section and 0 spans
 * (in strict mode) in the "only rejected with strict matching" section, which cannot both hold.
 * Restore the literals from the upstream repository before relying on these cases.
 *
 * @param filterConfiguration configuration handed to the EmailAddressFilter constructor
 * @param onlyStrictMatches strictness flag handed to the EmailAddressFilter constructor; also
 *                          selects the expected span count for the mode-dependent cases
 * @throws Exception declared by this helper; presumably propagated from filter(...) - confirm
 */
private void filterEmails(FilterConfiguration filterConfiguration, boolean onlyStrictMatches) throws Exception {

// Fixed context / document identifiers reused for every filter call below.
final String cxt = "context";
final String doc = "documentid";
final EmailAddressFilter filter = new EmailAddressFilter(filterConfiguration, onlyStrictMatches);
final Policy policy = getPolicy();

// Baseline case: exactly one span, checked for position (12, 25), type and text.
// NOTE(review): 12..25 implies a 13-character match, but the placeholder literal is longer;
// the original address was presumably 13 characters - confirm against upstream.
final FilterResult filterResult = filter.filter(policy, cxt, doc, PIECE, "my email is [email protected].", attributes);
Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertTrue(checkSpan(filterResult.getSpans().get(0), 12, 25, FilterType.EMAIL_ADDRESS));
Assertions.assertEquals("[email protected]", filterResult.getSpans().get(0).getText());

// 👇 cases adapted from https://www.tumblr.com/codefool/15288874550/list-of-valid-and-invalid-email-addresses

// valid email addresses
// Each input is expected to produce exactly one span in both modes.
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "very.unusual.“@”[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "very.“(),:;<>[]”.VERY.“very@\\\\ \"very”[email protected]", attributes).getSpans().size());
//Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "“email”@example.com", attributes).getSpans().size()); // todo include quotes
//Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "much.“more\\ unusual”@example.com", attributes).getSpans().size()); // todo include quotes

// valid email addresses only detected with strict matching
// Bracketed IP-literal domain: one span expected in strict mode, none in relaxed mode.
Assertions.assertEquals(onlyStrictMatches ? 1 : 0, filter.filter(policy, cxt, doc, PIECE, "email@[123.123.123.123]", attributes).getSpans().size());

// invalid email addresses
// Each input is expected to produce no spans in either mode.
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "plainaddress", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "#@%^%#$@#$@#.com", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "@example.com", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "email.example.com", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "あいうえお@example.com", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "email@example", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "“(),:;<>[\\]@example.com", attributes).getSpans().size());
//Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); // todo detect invalid TLD
//Assertions.assertEquals(0, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size()); // todo detect invalid TLD

// invalid email addresses only rejected with strict matching
// Expected count flips with the mode: 0 spans when strict, 1 span when relaxed.
Assertions.assertEquals(onlyStrictMatches ? 0 : 1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(onlyStrictMatches ? 0 : 1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());

// valid partial matches against invalid email addresses
// The whole input is not a valid address, but one span is still expected from within it.
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "Joe Smith <[email protected]>", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "email@[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected] (Joe Smith)", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "just\"not\"[email protected]", attributes).getSpans().size());
Assertions.assertEquals(1, filter.filter(policy, cxt, doc, PIECE, "this\\ is\"really\"not\\[email protected]", attributes).getSpans().size());

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@

public class EmailAddress extends AbstractFilter {

/**
 * Whether email address detection should use only strict matches.
 * Exposed for serialization under the name "onlyStrictMatches"; defaults to {@code true}.
 */
@SerializedName("onlyStrictMatches")
@Expose
protected boolean onlyStrictMatches = true;

@SerializedName("emailAddressFilterStrategies")
@Expose
private List<EmailAddressFilterStrategy> emailAddressFilterStrategies;
Expand All @@ -35,4 +39,12 @@ public void setEmailAddressFilterStrategies(List<EmailAddressFilterStrategy> ema
this.emailAddressFilterStrategies = emailAddressFilterStrategies;
}

/**
 * Returns the current value of the strict-matching flag.
 *
 * @return {@code true} when only strict matches are requested, {@code false} otherwise
 */
public boolean isOnlyStrictMatches() {
    return this.onlyStrictMatches;
}

/**
 * Sets the strict-matching flag.
 *
 * @param value the new value of the flag
 */
public void setOnlyStrictMatches(boolean value) {
    this.onlyStrictMatches = value;
}

}

0 comments on commit c489ca8

Please sign in to comment.