diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/FlagUpdater.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/FlagUpdater.java index 7bdbb1094..7f628474a 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/FlagUpdater.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/FlagUpdater.java @@ -16,22 +16,27 @@ public class FlagUpdater implements Updater { @Override public ProteinDescription fromXml(ProteinDescription modelObject, SequenceType xmlObject) { - FlagType fType = Optional.ofNullable(modelObject.getFlag()).map(Flag::getType).orElse(null); - String frag = xmlObject.getFragment(); - if (xmlObject.isPrecursor() != null && xmlObject.isPrecursor()) { - if (SINGLE.equals(frag)) { - fType = FlagType.FRAGMENT_PRECURSOR; + ProteinDescriptionBuilder result = ProteinDescriptionBuilder.from(modelObject); + if (xmlObject != null) { + FlagType fType = + Optional.ofNullable(modelObject.getFlag()).map(Flag::getType).orElse(null); + String frag = xmlObject.getFragment(); + if (xmlObject.isPrecursor() != null && xmlObject.isPrecursor()) { + if (SINGLE.equals(frag)) { + fType = FlagType.FRAGMENT_PRECURSOR; + } else if (MULTIPLE.equals(frag)) { + fType = FlagType.FRAGMENTS_PRECURSOR; + } else { + fType = FlagType.PRECURSOR; + } + } else if (SINGLE.equals(frag)) { + fType = FlagType.FRAGMENT; } else if (MULTIPLE.equals(frag)) { - fType = FlagType.FRAGMENTS_PRECURSOR; - } else { - fType = FlagType.PRECURSOR; + fType = FlagType.FRAGMENTS; } - } else if (SINGLE.equals(frag)) { - fType = FlagType.FRAGMENT; - } else if (MULTIPLE.equals(frag)) { - fType = FlagType.FRAGMENTS; + result.flag(fType); } - return ProteinDescriptionBuilder.from(modelObject).flag(fType).build(); + return result.build(); } @Override diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/OrganismConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/OrganismConverter.java index bbf0285bd..849981777 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/OrganismConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/OrganismConverter.java @@ -4,6 +4,7 @@ import org.uniprot.core.uniprotkb.taxonomy.Organism; import org.uniprot.core.uniprotkb.taxonomy.impl.OrganismBuilder; +import org.uniprot.core.util.Utils; import org.uniprot.core.xml.Converter; import org.uniprot.core.xml.jaxb.uniprot.ObjectFactory; import org.uniprot.core.xml.jaxb.uniprot.OrganismType; @@ -24,10 +25,18 @@ public OrganismConverter(EvidenceIndexMapper evRefMapper, ObjectFactory xmlUnipr @Override public Organism fromXml(OrganismType xmlObj) { OrganismBuilder builder = new OrganismBuilder(); - builder.taxonId(Long.parseLong(xmlObj.getDbReference().get(0).getId())); - OrganismConverterUtil.updateOrganismNameFromXml(xmlObj.getName(), builder); - builder.lineagesSet(xmlObj.getLineage().getTaxon()); - builder.evidencesSet(evRefMapper.parseEvidenceIds(xmlObj.getEvidence())); + if (xmlObj != null) { + if (Utils.notNullNotEmpty(xmlObj.getDbReference())) { + builder.taxonId(Long.parseLong(xmlObj.getDbReference().get(0).getId())); + } + if (Utils.notNullNotEmpty(xmlObj.getName())) { + OrganismConverterUtil.updateOrganismNameFromXml(xmlObj.getName(), builder); + } + if (xmlObj.getLineage() != null) { + builder.lineagesSet(xmlObj.getLineage().getTaxon()); + } + builder.evidencesSet(evRefMapper.parseEvidenceIds(xmlObj.getEvidence())); + } return builder.build(); } diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/SequenceConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/SequenceConverter.java index 9eff14dbc..01eb914f7 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/SequenceConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/SequenceConverter.java @@ -19,7 +19,10 @@ public SequenceConverter(ObjectFactory xmlUniprotFactory) { @Override public Sequence fromXml(SequenceType xmlObj) { - String sequence = xmlObj.getValue(); + String sequence = ""; + if (xmlObj != null) { + sequence = xmlObj.getValue(); + } // sequence = sequence.replaceAll(" ", ""); return new SequenceBuilder(sequence).build(); } diff --git a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/UniProtEntryConverter.java b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/UniProtEntryConverter.java index 28f844d7c..62f1075d4 100644 --- a/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/UniProtEntryConverter.java +++ b/xml-parser/src/main/java/org/uniprot/core/xml/uniprot/UniProtEntryConverter.java @@ -16,6 +16,7 @@ import org.uniprot.core.uniprotkb.impl.EntryAuditBuilder; import org.uniprot.core.uniprotkb.impl.UniProtKBAccessionBuilder; import org.uniprot.core.uniprotkb.impl.UniProtKBEntryBuilder; +import org.uniprot.core.util.Utils; import org.uniprot.core.xml.Converter; import org.uniprot.core.xml.jaxb.uniprot.*; import org.uniprot.core.xml.uniprot.citation.ReferenceConverter; @@ -166,30 +167,32 @@ private SequenceType toXmlForSequence(UniProtKBEntry entry) { // ..with multiple interactions. private List fromXmlForComments(Entry xmlEntry) { List uniComments = new ArrayList<>(); - List comments = xmlEntry.getComment(); - List interactionComment = - comments.stream() - .filter(val -> val.getType().equals(INTERACTION)) - .collect(Collectors.toList()); + if (Utils.notNullNotEmpty(xmlEntry.getComment())) { + List comments = xmlEntry.getComment(); + List interactionComment = + comments.stream() + .filter(val -> val.getType().equals(INTERACTION)) + .collect(Collectors.toList()); - boolean interactionsFirst = true; - for (org.uniprot.core.xml.jaxb.uniprot.CommentType commentType : comments) { - if (commentType.getType().equals(INTERACTION)) { - if (interactionsFirst) { - interactionsFirst = false; + boolean interactionsFirst = true; + for (org.uniprot.core.xml.jaxb.uniprot.CommentType commentType : comments) { + if (commentType.getType().equals(INTERACTION)) { + if (interactionsFirst) { + interactionsFirst = false; + uniComments.add( + CommentConverterFactory.INSTANCE + .createInteractionCommentConverter(this.xmlUniprotFactory) + .fromXml(interactionComment)); + } + } else { + org.uniprot.core.uniprotkb.comment.CommentType type = + org.uniprot.core.uniprotkb.comment.CommentType.typeOf( + commentType.getType()); uniComments.add( CommentConverterFactory.INSTANCE - .createInteractionCommentConverter(this.xmlUniprotFactory) - .fromXml(interactionComment)); + .createCommentConverter(type, evRefMapper, xmlUniprotFactory) + .fromXml(commentType)); } - } else { - org.uniprot.core.uniprotkb.comment.CommentType type = - org.uniprot.core.uniprotkb.comment.CommentType.typeOf( - commentType.getType()); - uniComments.add( - CommentConverterFactory.INSTANCE - .createCommentConverter(type, evRefMapper, xmlUniprotFactory) - .fromXml(commentType)); } } return uniComments; @@ -233,9 +236,9 @@ private UniProtKBEntryBuilder createUniprotEntryBuilderFromXml(Entry xmlEntry) { List accessions = xmlEntry.getAccession(); return new UniProtKBEntryBuilder( accessions.get(0), - xmlEntry.getName().get(0), + getUniProtId(xmlEntry), UniProtKBEntryType.typeOf(xmlEntry.getDataset())) - .proteinExistence(ProteinExistence.typeOf(xmlEntry.getProteinExistence().getType())) + .proteinExistence(getProteinExistence(xmlEntry)) .secondaryAccessionsSet( accessions.subList(1, accessions.size()).stream() .map(sec -> new UniProtKBAccessionBuilder(sec).build()) @@ -243,12 +246,32 @@ private UniProtKBEntryBuilder createUniprotEntryBuilderFromXml(Entry xmlEntry) { .entryAudit(entryAuditFromXml(xmlEntry)); } + private ProteinExistence getProteinExistence(Entry xmlEntry) { + ProteinExistence proteinExistence = ProteinExistence.UNKNOWN; + if (xmlEntry.getProteinExistence() != null) { + proteinExistence = ProteinExistence.typeOf(xmlEntry.getProteinExistence().getType()); + } + return proteinExistence; + } + + private String getUniProtId(Entry xmlEntry) { + String uniProtId = ""; + if (Utils.notNullNotEmpty(xmlEntry.getName())) { + uniProtId = xmlEntry.getName().get(0); + } + return uniProtId; + } + private EntryAudit entryAuditFromXml(Entry xmlEntry) { int version = xmlEntry.getVersion(); LocalDate firstPublic = XmlConverterHelper.dateFromXml(xmlEntry.getCreated()); LocalDate lastUpdated = XmlConverterHelper.dateFromXml(xmlEntry.getModified()); - int seqVersion = xmlEntry.getSequence().getVersion(); - LocalDate seqDate = XmlConverterHelper.dateFromXml(xmlEntry.getSequence().getModified()); + int seqVersion = 0; + LocalDate seqDate = null; + if (xmlEntry.getSequence() != null) { + seqVersion = xmlEntry.getSequence().getVersion(); + seqDate = XmlConverterHelper.dateFromXml(xmlEntry.getSequence().getModified()); + } return new EntryAuditBuilder() .firstPublic(firstPublic) .lastAnnotationUpdate(lastUpdated) diff --git a/xml-parser/src/test/java/org/uniprot/core/xml/ConverterXMLToFFTest.java b/xml-parser/src/test/java/org/uniprot/core/xml/ConverterXMLToFFTest.java new file mode 100644 index 000000000..749d99270 --- /dev/null +++ b/xml-parser/src/test/java/org/uniprot/core/xml/ConverterXMLToFFTest.java @@ -0,0 +1,66 @@ +package org.uniprot.core.xml; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.time.LocalDate; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.Marshaller; +import javax.xml.bind.Unmarshaller; + +import org.junit.jupiter.api.Test; +import org.uniprot.core.flatfile.writer.impl.UniProtFlatfileWriter; +import org.uniprot.core.uniprotkb.UniProtKBEntry; +import org.uniprot.core.uniprotkb.impl.EntryAuditBuilder; +import org.uniprot.core.uniprotkb.impl.UniProtKBEntryBuilder; +import org.uniprot.core.xml.jaxb.uniprot.Uniprot; +import org.uniprot.core.xml.uniprot.UniProtEntryConverter; + +class ConverterXMLToFFTest { + // XSD --> + // https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot.xsd + + @Test + void testXMLToFF() throws Exception { + String file = "/google/entry_v4_modified.xml"; + InputStream inputStream = ConverterXMLToFFTest.class.getResourceAsStream(file); + JAXBContext jaxbContext = JAXBContext.newInstance("org.uniprot.core.xml.jaxb.uniprot"); + + Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller(); + Uniprot xmlEntry = (Uniprot) jaxbUnmarshaller.unmarshal(inputStream); + assertNotNull(xmlEntry); + + UniProtEntryConverter converter = new UniProtEntryConverter(); + UniProtKBEntry uniprotEntry = + xmlEntry.getEntry().stream().map(converter::fromXml).findFirst().orElse(null); + assertNotNull(uniprotEntry); + + UniProtKBEntry auditedEntry = + UniProtKBEntryBuilder.from(uniprotEntry) + .entryAudit( + new EntryAuditBuilder() + .firstPublic(LocalDate.now()) + .lastAnnotationUpdate(LocalDate.now()) + .lastSequenceUpdate(LocalDate.now()) + .build()) + .build(); + + // "XML FILE" --> "FF PARTIAL" (Curator can use to validate) + + String ffResult = UniProtFlatfileWriter.write(auditedEntry); + assertNotNull(ffResult); + System.out.println(ffResult); + } + + protected Marshaller createMarshaller(JAXBContext jaxbContext) { + try { + Marshaller contextMarshaller = jaxbContext.createMarshaller(); + contextMarshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE); + contextMarshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE); + return contextMarshaller; + } catch (Exception e) { + throw new RuntimeException("JAXB marshaller creation failed", e); + } + } +} diff --git a/xml-parser/src/test/resources/google/entry_v4.xml b/xml-parser/src/test/resources/google/entry_v4.xml new file mode 100644 index 000000000..ecee62730 --- /dev/null +++ b/xml-parser/src/test/resources/google/entry_v4.xml @@ -0,0 +1,13 @@ + + + A0A2E0WTX0 + + + NAD(P)H-dependent oxidoreductase + + + + similarity + Belongs to the nitroreductase family. + + \ No newline at end of file diff --git a/xml-parser/src/test/resources/google/entry_v4_modified.xml b/xml-parser/src/test/resources/google/entry_v4_modified.xml new file mode 100644 index 000000000..747fe03c4 --- /dev/null +++ b/xml-parser/src/test/resources/google/entry_v4_modified.xml @@ -0,0 +1,15 @@ + + + + A0A2E0WTX0 + Dummy Value + + + NAD(P)H-dependent oxidoreductase + + + + Belongs to the nitroreductase family. + + + \ No newline at end of file