Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes in XML parser to allow partial content #189

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,27 @@ public class FlagUpdater implements Updater<SequenceType, ProteinDescription> {

@Override
public ProteinDescription fromXml(ProteinDescription modelObject, SequenceType xmlObject) {
FlagType fType = Optional.ofNullable(modelObject.getFlag()).map(Flag::getType).orElse(null);
String frag = xmlObject.getFragment();
if (xmlObject.isPrecursor() != null && xmlObject.isPrecursor()) {
if (SINGLE.equals(frag)) {
fType = FlagType.FRAGMENT_PRECURSOR;
ProteinDescriptionBuilder result = ProteinDescriptionBuilder.from(modelObject);
if (xmlObject != null) {
FlagType fType =
Optional.ofNullable(modelObject.getFlag()).map(Flag::getType).orElse(null);
String frag = xmlObject.getFragment();
if (xmlObject.isPrecursor() != null && xmlObject.isPrecursor()) {
if (SINGLE.equals(frag)) {
fType = FlagType.FRAGMENT_PRECURSOR;
} else if (MULTIPLE.equals(frag)) {
fType = FlagType.FRAGMENTS_PRECURSOR;
} else {
fType = FlagType.PRECURSOR;
}
} else if (SINGLE.equals(frag)) {
fType = FlagType.FRAGMENT;
} else if (MULTIPLE.equals(frag)) {
fType = FlagType.FRAGMENTS_PRECURSOR;
} else {
fType = FlagType.PRECURSOR;
fType = FlagType.FRAGMENTS;
}
} else if (SINGLE.equals(frag)) {
fType = FlagType.FRAGMENT;
} else if (MULTIPLE.equals(frag)) {
fType = FlagType.FRAGMENTS;
result.flag(fType);
}
return ProteinDescriptionBuilder.from(modelObject).flag(fType).build();
return result.build();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import org.uniprot.core.uniprotkb.taxonomy.Organism;
import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder;
import org.uniprot.core.util.Utils;
import org.uniprot.core.xml.Converter;
import org.uniprot.core.xml.jaxb.uniprot.ObjectFactory;
import org.uniprot.core.xml.jaxb.uniprot.OrganismType;
Expand All @@ -24,10 +25,18 @@ public OrganismConverter(EvidenceIndexMapper evRefMapper, ObjectFactory xmlUnipr
@Override
public Organism fromXml(OrganismType xmlObj) {
OrganismBuilder builder = new OrganismBuilder();
builder.taxonId(Long.parseLong(xmlObj.getDbReference().get(0).getId()));
OrganismConverterUtil.updateOrganismNameFromXml(xmlObj.getName(), builder);
builder.lineagesSet(xmlObj.getLineage().getTaxon());
builder.evidencesSet(evRefMapper.parseEvidenceIds(xmlObj.getEvidence()));
if (xmlObj != null) {
if (Utils.notNullNotEmpty(xmlObj.getDbReference())) {
builder.taxonId(Long.parseLong(xmlObj.getDbReference().get(0).getId()));
}
if (Utils.notNullNotEmpty(xmlObj.getName())) {
OrganismConverterUtil.updateOrganismNameFromXml(xmlObj.getName(), builder);
}
if (xmlObj.getLineage() != null) {
builder.lineagesSet(xmlObj.getLineage().getTaxon());
}
builder.evidencesSet(evRefMapper.parseEvidenceIds(xmlObj.getEvidence()));
}
return builder.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ public SequenceConverter(ObjectFactory xmlUniprotFactory) {

@Override
public Sequence fromXml(SequenceType xmlObj) {
String sequence = xmlObj.getValue();
String sequence = "";
if (xmlObj != null) {
sequence = xmlObj.getValue();
}
// sequence = sequence.replaceAll(" ", "");
return new SequenceBuilder(sequence).build();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.uniprot.core.uniprotkb.impl.EntryAuditBuilder;
import org.uniprot.core.uniprotkb.impl.UniProtKBAccessionBuilder;
import org.uniprot.core.uniprotkb.impl.UniProtKBEntryBuilder;
import org.uniprot.core.util.Utils;
import org.uniprot.core.xml.Converter;
import org.uniprot.core.xml.jaxb.uniprot.*;
import org.uniprot.core.xml.uniprot.citation.ReferenceConverter;
Expand Down Expand Up @@ -166,30 +167,32 @@ private SequenceType toXmlForSequence(UniProtKBEntry entry) {
// ..with multiple interactions.
private List<Comment> fromXmlForComments(Entry xmlEntry) {
List<Comment> uniComments = new ArrayList<>();
List<org.uniprot.core.xml.jaxb.uniprot.CommentType> comments = xmlEntry.getComment();
List<org.uniprot.core.xml.jaxb.uniprot.CommentType> interactionComment =
comments.stream()
.filter(val -> val.getType().equals(INTERACTION))
.collect(Collectors.toList());
if (Utils.notNullNotEmpty(xmlEntry.getComment())) {
List<org.uniprot.core.xml.jaxb.uniprot.CommentType> comments = xmlEntry.getComment();
List<org.uniprot.core.xml.jaxb.uniprot.CommentType> interactionComment =
comments.stream()
.filter(val -> val.getType().equals(INTERACTION))
.collect(Collectors.toList());

boolean interactionsFirst = true;
for (org.uniprot.core.xml.jaxb.uniprot.CommentType commentType : comments) {
if (commentType.getType().equals(INTERACTION)) {
if (interactionsFirst) {
interactionsFirst = false;
boolean interactionsFirst = true;
for (org.uniprot.core.xml.jaxb.uniprot.CommentType commentType : comments) {
if (commentType.getType().equals(INTERACTION)) {
if (interactionsFirst) {
interactionsFirst = false;
uniComments.add(
CommentConverterFactory.INSTANCE
.createInteractionCommentConverter(this.xmlUniprotFactory)
.fromXml(interactionComment));
}
} else {
org.uniprot.core.uniprotkb.comment.CommentType type =
org.uniprot.core.uniprotkb.comment.CommentType.typeOf(
commentType.getType());
uniComments.add(
CommentConverterFactory.INSTANCE
.createInteractionCommentConverter(this.xmlUniprotFactory)
.fromXml(interactionComment));
.createCommentConverter(type, evRefMapper, xmlUniprotFactory)
.fromXml(commentType));
}
} else {
org.uniprot.core.uniprotkb.comment.CommentType type =
org.uniprot.core.uniprotkb.comment.CommentType.typeOf(
commentType.getType());
uniComments.add(
CommentConverterFactory.INSTANCE
.createCommentConverter(type, evRefMapper, xmlUniprotFactory)
.fromXml(commentType));
}
}
return uniComments;
Expand Down Expand Up @@ -233,22 +236,42 @@ private UniProtKBEntryBuilder createUniprotEntryBuilderFromXml(Entry xmlEntry) {
List<String> accessions = xmlEntry.getAccession();
return new UniProtKBEntryBuilder(
accessions.get(0),
xmlEntry.getName().get(0),
getUniProtId(xmlEntry),
UniProtKBEntryType.typeOf(xmlEntry.getDataset()))
.proteinExistence(ProteinExistence.typeOf(xmlEntry.getProteinExistence().getType()))
.proteinExistence(getProteinExistence(xmlEntry))
.secondaryAccessionsSet(
accessions.subList(1, accessions.size()).stream()
.map(sec -> new UniProtKBAccessionBuilder(sec).build())
.collect(Collectors.toList()))
.entryAudit(entryAuditFromXml(xmlEntry));
}

/**
 * Resolves the protein existence level of an XML entry.
 *
 * @param xmlEntry the unmarshalled XML entry
 * @return {@link ProteinExistence#UNKNOWN} when the entry has no protein existence element,
 *     otherwise the value mapped from that element's type via {@code ProteinExistence.typeOf}
 */
private ProteinExistence getProteinExistence(Entry xmlEntry) {
    // Guard first: without the element there is no type to map.
    if (xmlEntry.getProteinExistence() == null) {
        return ProteinExistence.UNKNOWN;
    }
    return ProteinExistence.typeOf(xmlEntry.getProteinExistence().getType());
}

/**
 * Picks the UniProt id of an XML entry.
 *
 * @param xmlEntry the unmarshalled XML entry
 * @return the first element of the entry's name list, or an empty string when that list is
 *     null or empty
 */
private String getUniProtId(Entry xmlEntry) {
    return Utils.notNullNotEmpty(xmlEntry.getName()) ? xmlEntry.getName().get(0) : "";
}

private EntryAudit entryAuditFromXml(Entry xmlEntry) {
int version = xmlEntry.getVersion();
LocalDate firstPublic = XmlConverterHelper.dateFromXml(xmlEntry.getCreated());
LocalDate lastUpdated = XmlConverterHelper.dateFromXml(xmlEntry.getModified());
int seqVersion = xmlEntry.getSequence().getVersion();
LocalDate seqDate = XmlConverterHelper.dateFromXml(xmlEntry.getSequence().getModified());
int seqVersion = 0;
LocalDate seqDate = null;
if (xmlEntry.getSequence() != null) {
seqVersion = xmlEntry.getSequence().getVersion();
seqDate = XmlConverterHelper.dateFromXml(xmlEntry.getSequence().getModified());
}
return new EntryAuditBuilder()
.firstPublic(firstPublic)
.lastAnnotationUpdate(lastUpdated)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.uniprot.core.xml;

import static org.junit.jupiter.api.Assertions.assertNotNull;

import java.io.InputStream;
import java.time.LocalDate;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Marshaller;
import javax.xml.bind.Unmarshaller;

import org.junit.jupiter.api.Test;
import org.uniprot.core.flatfile.writer.impl.UniProtFlatfileWriter;
import org.uniprot.core.uniprotkb.UniProtKBEntry;
import org.uniprot.core.uniprotkb.impl.EntryAuditBuilder;
import org.uniprot.core.uniprotkb.impl.UniProtKBEntryBuilder;
import org.uniprot.core.xml.jaxb.uniprot.Uniprot;
import org.uniprot.core.xml.uniprot.UniProtEntryConverter;

/**
 * Checks that an XML entry can be unmarshalled, converted to a {@link UniProtKBEntry} and written
 * out in flat-file format.
 */
class ConverterXMLToFFTest {
    // XSD -->
    // https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot.xsd

    /**
     * Reads {@code /google/entry_v4_modified.xml} from the classpath, converts the first entry it
     * contains, replaces its audit section with today's date and renders it as flat file.
     *
     * @throws Exception if the JAXB context cannot be created, unmarshalling fails, or closing the
     *     resource stream fails
     */
    @Test
    void testXMLToFF() throws Exception {
        String file = "/google/entry_v4_modified.xml";
        JAXBContext jaxbContext = JAXBContext.newInstance("org.uniprot.core.xml.jaxb.uniprot");
        Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();

        Uniprot xmlEntry;
        // try-with-resources so the classpath stream is released even when unmarshalling throws.
        try (InputStream inputStream = ConverterXMLToFFTest.class.getResourceAsStream(file)) {
            // getResourceAsStream returns null for a missing resource; fail with a clear message
            // instead of an opaque error from inside the unmarshaller.
            assertNotNull(inputStream, "Test resource not found on classpath: " + file);
            xmlEntry = (Uniprot) jaxbUnmarshaller.unmarshal(inputStream);
        }
        assertNotNull(xmlEntry);

        UniProtEntryConverter converter = new UniProtEntryConverter();
        UniProtKBEntry uniprotEntry =
                xmlEntry.getEntry().stream().map(converter::fromXml).findFirst().orElse(null);
        assertNotNull(uniprotEntry);

        // The converted entry's audit is replaced with today's date for all three dates
        // (NOTE(review): presumably because the fixture carries no dates; confirm).
        LocalDate today = LocalDate.now();
        UniProtKBEntry auditedEntry =
                UniProtKBEntryBuilder.from(uniprotEntry)
                        .entryAudit(
                                new EntryAuditBuilder()
                                        .firstPublic(today)
                                        .lastAnnotationUpdate(today)
                                        .lastSequenceUpdate(today)
                                        .build())
                        .build();

        // "XML FILE" --> "FF PARTIAL" (Curator can use to validate)

        String ffResult = UniProtFlatfileWriter.write(auditedEntry);
        assertNotNull(ffResult);
        System.out.println(ffResult);
    }

    /**
     * Creates a marshaller configured for formatted, fragment output.
     *
     * @param jaxbContext the context to create the marshaller from
     * @return a marshaller with {@code JAXB_FORMATTED_OUTPUT} and {@code JAXB_FRAGMENT} enabled
     * @throws RuntimeException wrapping any failure raised while creating or configuring it
     */
    protected Marshaller createMarshaller(JAXBContext jaxbContext) {
        try {
            Marshaller contextMarshaller = jaxbContext.createMarshaller();
            contextMarshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
            contextMarshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
            return contextMarshaller;
        } catch (Exception e) {
            throw new RuntimeException("JAXB marshaller creation failed", e);
        }
    }
}
13 changes: 13 additions & 0 deletions xml-parser/src/test/resources/google/entry_v4.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<entry>
<accession>A0A2E0WTX0</accession>
<protein>
<submittedName>
<fullName>NAD(P)H-dependent oxidoreductase</fullName>
</submittedName>
</protein>
<comment>
<type>similarity</type>
<text>Belongs to the nitroreductase family.</text>
</comment>
</entry>
15 changes: 15 additions & 0 deletions xml-parser/src/test/resources/google/entry_v4_modified.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/docs/uniprot.xsd">
<entry>
<accession>A0A2E0WTX0</accession>
<name>Dummy Value</name>
<protein>
<submittedName>
<fullName>NAD(P)H-dependent oxidoreductase</fullName>
</submittedName>
</protein>
<comment type="similarity">
<text>Belongs to the nitroreductase family.</text>
</comment>
</entry>
</uniprot>