diff --git a/repository/BioParserTests/BioProteinParserTest.class.st b/repository/BioParserTests/BioProteinParserTest.class.st index 64eaf9ac..a537b39b 100644 --- a/repository/BioParserTests/BioProteinParserTest.class.st +++ b/repository/BioParserTests/BioProteinParserTest.class.st @@ -8,13 +8,13 @@ Class { BioProteinParserTest >> setUp [ super setUp. - parser := #proteinSequence asPParser. + parser := #proteinLetterGapped asPParser. ] { #category : #testing } BioProteinParserTest >> testProteinLetterMatches [ - parser := #proteinLetter asPParser. + parser := #proteinLetterGapped asPParser. 'ACDEFGHIKLMNPQRSTVWYBXZJUO' do: [: letter | self assert: (parser matches: (String with: letter))]. diff --git a/repository/BioParsers/BioAbstractParser.class.st b/repository/BioParsers/BioAbstractParser.class.st index 318e2fbf..5b562951 100644 --- a/repository/BioParsers/BioAbstractParser.class.st +++ b/repository/BioParsers/BioAbstractParser.class.st @@ -54,6 +54,36 @@ BioAbstractParser >> buildTokens: aCollection [ ^ self results ] +{ #category : #accessing } +BioAbstractParser >> debug [ + " Private - Debug the receiver's expression when there are no results yet. Answer the receiver's results " + + (results isNil or: [ results isEmpty ]) + ifTrue: [ self debug: expression ]. + ^ results +] + +{ #category : #accessing } +BioAbstractParser >> debug: aString [ + " Debug aString with the receiver's parser. NOTE(review): the answer of #debugString: is discarded here, so results is not updated by this method - confirm intended " + + self debugString: aString. + +] + +{ #category : #accessing } +BioAbstractParser >> debugString: aString [ + " Answer an object with the result of debugging aString with the receiver's parser. If an Exception is signaled, mark the receiver as failed and answer the exception printed as a String " + + self beSuccess. + ^ [ parser debug: aString ] + on: Exception + do: [: ex | + self beFailed. + ex asString ]. + +] + { #category : #accessing } BioAbstractParser >> expression [ " Answer a String with the receiver's target expression " @@ -70,6 +100,7 @@ BioAbstractParser >> expression: anObject [ BioAbstractParser >> initializeWith: aString [ " Private - Set the receiver's target expression to be parsed to aString " + self initialize.
expression := aString ] diff --git a/repository/BioParsers/BioAbstractTextParser.class.st b/repository/BioParsers/BioAbstractTextParser.class.st index a772f8f9..3e79fe0a 100644 --- a/repository/BioParsers/BioAbstractTextParser.class.st +++ b/repository/BioParsers/BioAbstractTextParser.class.st @@ -52,7 +52,7 @@ BioAbstractTextParser >> parserForAccession [ BioAbstractTextParser >> parserForDNAAlignment [ " Answer a parser for alignments, meaning sequences containing gaps " - ^ #dnaAmbiguousLetterGapped asPParser + ^ #dnaAmbiguousLetterGapped asPParser ] { #category : #'accessing-parsers' } diff --git a/repository/BioParsers/BioFASTABasicParser.class.st b/repository/BioParsers/BioFASTABasicParser.class.st index d4481bed..3a832ddd 100644 --- a/repository/BioParsers/BioFASTABasicParser.class.st +++ b/repository/BioParsers/BioFASTABasicParser.class.st @@ -4,6 +4,9 @@ This class is not intended to be used directly Class { #name : #BioFASTABasicParser, #superclass : #BioIDParser, + #instVars : [ + 'fastaSequenceParser' + ], #category : #'BioParsers-Core' } @@ -38,25 +41,62 @@ BioFASTABasicParser >> buildTokens: aCollection [ ] { #category : #initialization } -BioFASTABasicParser >> initialize [ - " Private - See superimplementor's comment " +BioFASTABasicParser >> fastaParser [ - super initialize. - parser := ( + ^ ( self parserForNonBreakingString , #newline asPParser , - self parserForSequence) plus end + (self perform: self fastaSequenceParser) + ) plus end ] +{ #category : #initialization } +BioFASTABasicParser >> fastaSequenceParser [ + " Answer the selector performed by #fastaParser to build the sequence parser " + ^ fastaSequenceParser + +] + +{ #category : #accessing } +BioFASTABasicParser >> fastaSequenceParser: anObject [ + " Set the selector performed by #fastaParser to build the sequence parser to anObject " + fastaSequenceParser := anObject +] + +{ #category : #'initialize-release' } +BioFASTABasicParser >> initializeWith: aString [ + " Private - See superimplementor's comment. The sequence parser selector defaults to #parserForSequence only if none was set before, so a selector configured prior to this message is not overwritten " + + super initializeWith: aString. + fastaSequenceParser ifNil: [ self fastaSequenceParser: #parserForSequence ]. + parser := self fastaParser.
+ +] + +{ #category : #'accessing-parsers' } +BioFASTABasicParser >> newLineParser [ + " Answer a parser for an optional run of one or more newlines " + ^ #newline asPParser plus optional +] + { #category : #'accessing-parsers' } BioFASTABasicParser >> parserForSequence [ " Answer a parser for parsing sequences as usually found in FASTA formatted files " - ^ (#word asPParser plus flatten , - #newline asPParser plus optional ==> [ :nodes | (nodes copyWithout: Character cr) ]) - min: 1 + ^ ( + ( + #word asPParser plus flatten , + self newLineParser ==> self removeNewlinesBlock + ) min: 1 + ) optimize +] + +{ #category : #removing } +BioFASTABasicParser >> removeNewlinesBlock [ + " Answer a block answering a copy of its argument without CR and LF characters " + ^ [ :nodes | nodes copyWithoutAll: { Character cr . Character lf } ] ] { #category : #'accessing private' } diff --git a/repository/BioParsers/BioFASTAMultiParser.class.st b/repository/BioParsers/BioFASTAMultiParser.class.st index e2f2bd67..34720d40 100644 --- a/repository/BioParsers/BioFASTAMultiParser.class.st +++ b/repository/BioParsers/BioFASTAMultiParser.class.st @@ -7,6 +7,24 @@ Class { #category : #'BioParsers-Core' } +{ #category : #'instance creation' } +BioFASTAMultiParser class >> onAmbigousGapped: anExpressionString [ + " Answer a new instance on anExpressionString whose sequence parser is built with #parserForAmbibuousGappedSequence " + ^ self basicNew + fastaSequenceParser: #parserForAmbibuousGappedSequence; + initializeWith: anExpressionString; + yourself +] + +{ #category : #'instance creation' } +BioFASTAMultiParser class >> onUnambigousGapped: anExpressionString [ + " Answer a new instance on anExpressionString whose sequence parser is built with #parserForUnambibuousGappedSequence " + ^ self basicNew + fastaSequenceParser: #parserForUnambibuousGappedSequence; + initializeWith: anExpressionString; + yourself +] + { #category : #'accessing private' } BioFASTAMultiParser >> buildResults: aCollection [ " Answer an identified object for the receiver's parsing output " @@ -26,3 +44,49 @@ BioFASTAMultiParser >> parseResultClass [ ^ BioFastaMultiRecord ] + +{ #category : #'accessing-parsers' } +BioFASTAMultiParser >> parserForAmbibuousGappedSequence [ + " Answer a parser for DNA alignment sequences built on #dnaAmbiguousLetterGapped (gaps are allowed), removing newlines from the parsed nodes " + + ^ ( + ( + #dnaAmbiguousLetterGapped
asPParser plus flatten , + self newLineParser + ) ==> self removeNewlinesBlock + ) min: 1 + +] + +{ #category : #'accessing-parsers' } +BioFASTAMultiParser >> parserForAmbibuousSequence [ + " Answer a parser for DNA sequences built on #dnaAmbiguousLetter (gaps are NOT allowed). NOTE(review): unlike #parserForAmbibuousGappedSequence no #removeNewlinesBlock action is attached - confirm intended " + + ^ ( + #dnaAmbiguousLetter asPParser plus flatten , + self newLineParser + ) min: 1 + +] + +{ #category : #'accessing-parsers' } +BioFASTAMultiParser >> parserForUnambibuousGappedSequence [ + " Answer a parser for DNA alignment sequences built on #dnaUnambiguousLetterGapped (gaps are allowed). NOTE(review): unlike #parserForAmbibuousGappedSequence no #removeNewlinesBlock action is attached - confirm intended " + + ^ ( + #dnaUnambiguousLetterGapped asPParser plus flatten , + self newLineParser + ) min: 1 + +] + +{ #category : #'accessing-parsers' } +BioFASTAMultiParser >> parserForUnambibuousSequence [ + " Answer a parser for DNA sequences built on #dnaSequence (gaps are NOT allowed). NOTE(review): #dnaSequence already applies plus flatten end and no #removeNewlinesBlock action is attached - confirm this works for multi-line input " + + ^ ( + #dnaSequence asPParser plus flatten , + self newLineParser + ) min: 1 + +] diff --git a/repository/BioParsers/BioMultiFASTAAlignmentParser.class.st b/repository/BioParsers/BioMultiFASTAAlignmentParser.class.st deleted file mode 100644 index 622e0613..00000000 --- a/repository/BioParsers/BioMultiFASTAAlignmentParser.class.st +++ /dev/null @@ -1,16 +0,0 @@ -Class { - #name : #BioMultiFASTAAlignmentParser, - #superclass : #BioFASTAMultiParser, - #category : #'BioParsers-Core' -} - -{ #category : #'accessing-parsers' } -BioMultiFASTAAlignmentParser >> parserForSequence [ - " Configure the receiver's parser to parse DNA alignment (gaps are allowed) " - - ^ (#dnaAmbiguousLetterGapped asPParser plus flatten , - #newline asPParser plus optional ==> [ :nodes | - nodes copyWithoutAll: { Character cr .
Character lf } ]) - min: 1 - -] diff --git a/repository/BioParsers/BioNCBIXMLBlastParser.class.st b/repository/BioParsers/BioNCBIXMLBlastParser.class.st index e533a269..671913e8 100644 --- a/repository/BioParsers/BioNCBIXMLBlastParser.class.st +++ b/repository/BioParsers/BioNCBIXMLBlastParser.class.st @@ -8,8 +8,7 @@ Class { BioNCBIXMLBlastParser >> initializeWith: aString [ " Private - Set the receiver's parser " - super initialize. - self expression: aString. + super initializeWith: aString. parser := XMLPullParser parse: self expression. ] diff --git a/repository/BioParsers/BioParser.class.st b/repository/BioParsers/BioParser.class.st index b07f45ec..9e5ef56d 100644 --- a/repository/BioParsers/BioParser.class.st +++ b/repository/BioParsers/BioParser.class.st @@ -109,7 +109,7 @@ BioParser class >> parseMultiFasta: aFastaString [ " Parser aFastaString representing a MultiFASTA sequence. Answer a FastaMultiRecord object " - ^ (BioFASTAMultiParser on: aFastaString) parse + ^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse ] { #category : #'parse-fasta' } @@ -117,15 +117,23 @@ BioParser class >> parseMultiFastaAlignment: aFastaString [ " Parser aFastaString representing a containing multiple sequences in FASTA format. Answer a object " - ^ (BioMultiFASTAAlignmentParser on: aFastaString) parse + ^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse ] { #category : #'parse-fasta' } BioParser class >> parseMultiFastaAlignmentFile: aFastaFullFileLocation [ " Parse aFastaFullFileLocation representing a containing a file with multiple sequences in FASTA format.
Answer a object " - ^ (BioMultiFASTAAlignmentParser - on: (self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment + ^ (BioFASTAMultiParser onAmbigousGapped: + (self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment +] + +{ #category : #'parse-fasta' } +BioParser class >> parseMultiFastaAlignmentFileFast: aFastaFullFileLocation [ + " Parse the file at aFastaFullFileLocation containing multiple sequences in FASTA format, handing the opened file itself (not its contents) to the parser. Answer the parse result converted with #asAlignment. NOTE(review): confirm the parser accepts the opened file as its expression " + + ^ (BioFASTAMultiParser + onAmbigousGapped: (self openFullFileNamed: aFastaFullFileLocation)) parse asAlignment ] { #category : #'parse-fasta' } @@ -253,7 +261,7 @@ BioParser class >> tokenizeCSV: aCSVStringOrStream delimiter: aCharacter [ BioParser class >> tokenizeFasta: aString [ " Tokenize aString representing a FASTA sequence " - ^ ( BioFASTABasicParser on: aString ) tokenize first + ^ (BioFASTABasicParser on: aString) tokenize first ] { #category : #'parse-fasta' } diff --git a/repository/BioParsers/BioPhylipParser.class.st b/repository/BioParsers/BioPhylipParser.class.st index 42f1881f..00f7ebe8 100644 --- a/repository/BioParsers/BioPhylipParser.class.st +++ b/repository/BioParsers/BioPhylipParser.class.st @@ -208,9 +208,9 @@ BioPhylipParser >> tokenizeInterleavedProtein [ parser := self firstLineTokenizer , (((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , - #proteinLetter asPParser trimBlanks star flatten , + #proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star , - (#proteinLetter asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. + (#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. ^ self tokenize.
] diff --git a/repository/BioParsers/PP2NodeFactory.extension.st b/repository/BioParsers/PP2NodeFactory.extension.st index 34c6fa7d..d932c819 100644 --- a/repository/BioParsers/PP2NodeFactory.extension.st +++ b/repository/BioParsers/PP2NodeFactory.extension.st @@ -1,13 +1,17 @@ Extension { #name : #PP2NodeFactory } +{ #category : #'*BioParsers' } +PP2NodeFactory >> dnaAmbiguousLetterGapped [ + " Answer a parser for a DNA letter using ambiguous IUPAC nomenclature, or one of the extra characters answered by #lowercaseAndUppercaseCodesWithGap " + + ^ PP2PredicateObjectNode anyOf: BioIUPACAmbiguousDNA lowercaseAndUppercaseCodesWithGap +] + { #category : #'*BioParsers' } PP2NodeFactory >> dnaLetter [ "Answer a parse to parse any uppercased or lowercased DNA nucleotide letter" - ^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream | - stream - << BioIUPACUnambiguousDNA uppercaseCodes; - << BioIUPACUnambiguousDNA lowercaseCodes ]) + ^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodes ] { #category : #'*BioParsers' } @@ -17,6 +21,13 @@ PP2NodeFactory >> dnaSequence [ ^ self dnaLetter plus flatten end ] +{ #category : #'*BioParsers' } +PP2NodeFactory >> dnaUnambiguousLetterGapped [ + " Answer a parser for a DNA letter using unambiguous IUPAC nomenclature, or one of the extra characters answered by #lowercaseAndUppercaseCodesWithGap " + + ^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodesWithGap +] + { #category : #'*BioParsers' } PP2NodeFactory >> number [ @@ -24,29 +35,27 @@ PP2NodeFactory >> number [ ] { #category : #'*BioParsers' } -PP2NodeFactory >> proteinLetter [ +PP2NodeFactory >> proteinLetterGapped [ " Answer a parser for a protein letter, using extended IUPAC nomenclature " - ^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream | - stream - << BioIUPACProteinExtended codes asUppercase; - << BioIUPACProteinExtended codes asLowercase; - << '?-' ]) + ^ PP2PredicateObjectNode anyOf: BioIUPACProteinExtended lowercaseAndUppercaseCodesWithGap ] { #category : #'*BioParsers' } -PP2NodeFactory >> proteinSequence [ - " Answer a parser for
Protein sequences, using IUPAC nomenclature " - - ^ self proteinLetter plus flatten end +PP2NodeFactory >> punctuation [ + " Answer a parser accepting a character included in #punctuationCharacters " + ^ PP2PredicateObjectNode new + predicate: (PP2CharSetPredicate on: [ :char | + self punctuationCharacters includes: char ]); + predicateMessage: 'punctuation expected'; + negated: (PP2CharSetPredicate on: [ :char | + (self punctuationCharacters includes: char) not ]); + negatedMessage: 'punctuation not expected'; + yourself ] { #category : #'*BioParsers' } -PP2NodeFactory >> punctuation [ - ^ PP2PredicateObjectNode new - predicate: (PP2CharSetPredicate on: [ :char | '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char ]); - predicateMessage: 'punctuation expected'; - negated: (PP2CharSetPredicate on: [ :char | ('.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char) not ]); - negatedMessage: 'punctuation not expected'; - yourself +PP2NodeFactory >> punctuationCharacters [ + " Answer a String with the characters accepted by the #punctuation parser " + ^ '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' ] diff --git a/repository/BioParsers/PPPredicateObjectParser.extension.st b/repository/BioParsers/PPPredicateObjectParser.extension.st index 1f284790..149bf780 100644 --- a/repository/BioParsers/PPPredicateObjectParser.extension.st +++ b/repository/BioParsers/PPPredicateObjectParser.extension.st @@ -4,9 +4,7 @@ Extension { #name : #PPPredicateObjectParser } PPPredicateObjectParser class >> dnaAmbiguousLetter [ " Answer a parser for a DNA letter, using ambiguous IUPAC nomenclature " - ^ self anyOf: - BioIUPACAmbiguousDNA codes asUppercase , - BioIUPACAmbiguousDNA codes asLowercase + ^ PP2PredicateObjectNode anyOf: BioIUPACAmbiguousDNA lowercaseAndUppercaseCodes ] diff --git a/repository/BioPlots/BioRSBaseBuilder.class.st b/repository/BioPlots/BioRSBaseBuilder.class.st index 0c0f98f4..542f12e4 100644 --- a/repository/BioPlots/BioRSBaseBuilder.class.st +++ b/repository/BioPlots/BioRSBaseBuilder.class.st @@ -63,12 +63,12 @@ BioRSBaseBuilder >> bases: anObject [ BioRSBaseBuilder >> colorMap [ ^ { - $A -> Color red . - $C -> Color green .
- $T -> Color blue . - $G -> Color yellow . + $A -> (Color fromString: #f4a261) . + $C -> (Color fromString: #e9c46a) . + $T -> (Color fromString: #'2a9d8f') . + $G -> (Color fromString: #e76f51) . $- -> Color white . - $N -> Color black + $N -> (Color fromString: #'264653') } asDictionary ] diff --git a/repository/BioTools/BioAlphabet.class.st b/repository/BioTools/BioAlphabet.class.st index baabebc4..5607c925 100644 --- a/repository/BioTools/BioAlphabet.class.st +++ b/repository/BioTools/BioAlphabet.class.st @@ -115,6 +115,39 @@ BioAlphabet class >> hasAmbiguousBases: aString [ ^ aString asUppercase anySatisfy: [ : letter | self ambiguityCodes includes: letter ] ] +{ #category : #accessing } +BioAlphabet class >> lowercaseAndUppercaseCodes [ + "Answer a String with the receiver's lowercase codes followed by its uppercase codes" + + ^ String streamContents: [ : stream | + stream + << self lowercaseCodes; + << self uppercaseCodes ] +] + +{ #category : #accessing } +BioAlphabet class >> lowercaseAndUppercaseCodesWithGap [ + "Answer a String with the receiver's lowercase and uppercase codes followed by the characters - and ?" + + ^ String streamContents: [ : stream | + stream + << self lowercaseAndUppercaseCodes; + << '-?' ] +] + +{ #category : #accessing } +BioAlphabet class >> lowercaseCodes [ + " Answer the receiver's codes converted to lowercase " + + ^ self codes asLowercase +] + +{ #category : #accessing } +BioAlphabet class >> uppercaseCodes [ + " Answer the receiver's codes unchanged. NOTE(review): presumably #codes already answers uppercase letters - confirm " + ^ self codes +] + { #category : #comparing } BioAlphabet >> = anObject [ "Answer whether the receiver and anObject represent the same object."
diff --git a/repository/BioTools/BioIUPACUnambiguousDNA.class.st b/repository/BioTools/BioIUPACUnambiguousDNA.class.st index f1d73280..67ccdcff 100644 --- a/repository/BioTools/BioIUPACUnambiguousDNA.class.st +++ b/repository/BioTools/BioIUPACUnambiguousDNA.class.st @@ -28,15 +28,3 @@ BioIUPACUnambiguousDNA class >> complementaryCases [ ^ #($A $T $T $A $C $G $G $C $N $N $a $t $t $a $c $g $g $c $n $n $- $- $ $ ) ] - -{ #category : #accessing } -BioIUPACUnambiguousDNA class >> lowercaseCodes [ - - ^ self codes asLowercase -] - -{ #category : #accessing } -BioIUPACUnambiguousDNA class >> uppercaseCodes [ - - ^ self codes -]