Skip to content

Commit

Permalink
Refactorings around parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
Hernán Morales Durand committed Aug 25, 2023
1 parent 61ccedb commit 8c34b6b
Show file tree
Hide file tree
Showing 14 changed files with 231 additions and 77 deletions.
4 changes: 2 additions & 2 deletions repository/BioParserTests/BioProteinParserTest.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ Class {
BioProteinParserTest >> setUp [

super setUp.
parser := #proteinSequence asPParser.
parser := #proteinLetterGapped asPParser.
]

{ #category : #testing }
BioProteinParserTest >> testProteinLetterMatches [

parser := #proteinLetter asPParser.
parser := #proteinLetterGapped asPParser.

'ACDEFGHIKLMNPQRSTVWYBXZJUO' do: [: letter |
self assert: (parser matches: (String with: letter))].
Expand Down
31 changes: 31 additions & 0 deletions repository/BioParsers/BioAbstractParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,36 @@ BioAbstractParser >> buildTokens: aCollection [
^ self results
]

{ #category : #accessing }
BioAbstractParser >> debug [
" Private - Debug the receiver's expression.
If results is nil or empty, #debug: is sent first with the receiver's expression.
Answer the receiver's results.
NOTE(review): #debug: as visible here discards the value of #debugString: and does not assign results - confirm results is populated elsewhere, otherwise this answers nil or an empty collection. "

(results isNil or: [ results isEmpty ])
ifTrue: [ self debug: expression ].
^ results
]

{ #category : #accessing }
BioAbstractParser >> debug: aString [
" Debug aString with the receiver's parser by delegating to #debugString:.
The value answered by #debugString: is not returned, so this method answers the receiver. "

self debugString: aString.

]

{ #category : #accessing }
BioAbstractParser >> debugString: aString [
" Answer an object with the result of sending #debug: with aString to the receiver's parser.
The receiver is marked with #beSuccess before parsing. If any Exception is signaled while parsing, the receiver is marked with #beFailed and the exception's #asString is answered instead. "

self beSuccess.
^ [ parser debug: aString ]
on: Exception
do: [: ex |
self beFailed.
ex asString ].

]

{ #category : #accessing }
BioAbstractParser >> expression [
" Answer a String with the receiver's target expression "
Expand All @@ -70,6 +100,7 @@ BioAbstractParser >> expression: anObject [
BioAbstractParser >> initializeWith: aString [
" Private - Set the receiver's target expression to be parsed to aString.
#initialize is sent to the receiver before aString is stored in expression. "

self initialize.
expression := aString
]

Expand Down
2 changes: 1 addition & 1 deletion repository/BioParsers/BioAbstractTextParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ BioAbstractTextParser >> parserForAccession [
BioAbstractTextParser >> parserForDNAAlignment [
" Answer a parser for alignments, meaning sequences containing gaps "

^ #dnaAmbiguousLetterGapped asPParser
^ #dnaAmbiguousLetterGapped asPParser
]

{ #category : #'accessing-parsers' }
Expand Down
56 changes: 48 additions & 8 deletions repository/BioParsers/BioFASTABasicParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ This class is not intended to be used directly
Class {
#name : #BioFASTABasicParser,
#superclass : #BioIDParser,
#instVars : [
'fastaSequenceParser'
],
#category : #'BioParsers-Core'
}

Expand Down Expand Up @@ -38,25 +41,62 @@ BioFASTABasicParser >> buildTokens: aCollection [
]

{ #category : #initialization }
BioFASTABasicParser >> initialize [
" Private - See superimplementor's comment "
BioFASTABasicParser >> fastaParser [

super initialize.
parser := (
^ (
self parserForNonBreakingString ,
#newline asPParser ,
self parserForSequence) plus end
(self perform: self fastaSequenceParser)
) plus end


]

{ #category : #initialization }
BioFASTABasicParser >> fastaSequenceParser [
" Answer the object stored in fastaSequenceParser.
#fastaParser sends it to the receiver through #perform:, so it is expected to be a selector naming a parser-building method (e.g. #parserForSequence). "

^ fastaSequenceParser

]

{ #category : #accessing }
BioFASTABasicParser >> fastaSequenceParser: anObject [
" Set the receiver's fastaSequenceParser to anObject.
The senders visible in this package pass a selector such as #parserForSequence. "

fastaSequenceParser := anObject
]

{ #category : #'initialize-release' }
BioFASTABasicParser >> initializeWith: aString [
	" Private - See superimplementor's comment.
	A sequence parser selector configured before this message is sent (e.g. by a class-side creation method sending #fastaSequenceParser: ahead of #initializeWith:) is kept; #parserForSequence is only used as the default when none was set. The receiver's parser is then built from that selector. "

	super initializeWith: aString.
	fastaSequenceParser ifNil: [ self fastaSequenceParser: #parserForSequence ].
	parser := self fastaParser.

]

{ #category : #'accessing-parsers' }
BioFASTABasicParser >> newLineParser [
" Answer a parser built as '#newline asPParser plus optional', i.e. an optional run of one or more newlines "

^ #newline asPParser plus optional
]

{ #category : #'accessing-parsers' }
BioFASTABasicParser >> parserForSequence [
" Answer a parser for parsing sequences as usually found in FASTA formatted files "

^ (#word asPParser plus flatten ,
#newline asPParser plus optional ==> [ :nodes | (nodes copyWithout: Character cr) ])
min: 1
^ (
(
#word asPParser plus flatten ,
self newLineParser ==> self removeNewlinesBlock
) min: 1
) optimize
]

{ #category : #removing }
BioFASTABasicParser >> removeNewlinesBlock [
" Answer a one-argument block answering a copy of its argument without Character cr and Character lf elements.
It is attached with ==> as a parse action by the sequence parsers. "

^ [ :nodes | nodes copyWithoutAll: { Character cr . Character lf } ]
]

{ #category : #'accessing private' }
Expand Down
64 changes: 64 additions & 0 deletions repository/BioParsers/BioFASTAMultiParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,24 @@ Class {
#category : #'BioParsers-Core'
}

{ #category : #'instance creation' }
BioFASTAMultiParser class >> onAmbigousGapped: anExpressionString [
" Answer a new instance of the receiver on anExpressionString, with #parserForAmbibuousGappedSequence set as its sequence parser selector.
The instance is created with #basicNew and the selector is set before #initializeWith: is sent.
NOTE(review): confirm #initializeWith: does not replace the selector configured here with its own default. "

^ self basicNew
fastaSequenceParser: #parserForAmbibuousGappedSequence;
initializeWith: anExpressionString;
yourself
]

{ #category : #'instance creation' }
BioFASTAMultiParser class >> onUnambigousGapped: anExpressionString [
" Answer a new instance of the receiver on anExpressionString, with #parserForUnambibuousGappedSequence set as its sequence parser selector.
The instance is created with #basicNew and the selector is set before #initializeWith: is sent.
NOTE(review): confirm #initializeWith: does not replace the selector configured here with its own default. "

^ self basicNew
fastaSequenceParser: #parserForUnambibuousGappedSequence;
initializeWith: anExpressionString;
yourself
]

{ #category : #'accessing private' }
BioFASTAMultiParser >> buildResults: aCollection [
" Answer an identified object for the receiver's parsing output "
Expand All @@ -26,3 +44,49 @@ BioFASTAMultiParser >> parseResultClass [

^ BioFastaMultiRecord
]

{ #category : #'accessing-parsers' }
BioFASTAMultiParser >> parserForAmbibuousGappedSequence [
" Answer a parser for DNA alignment sequences using ambiguous letters (gaps are allowed).
Each repetition is a flattened run of #dnaAmbiguousLetterGapped followed by the parser from #newLineParser, post-processed with #removeNewlinesBlock; at least one repetition is required (min: 1). "

^ (
(
#dnaAmbiguousLetterGapped asPParser plus flatten ,
self newLineParser
) ==> self removeNewlinesBlock
) min: 1

]

{ #category : #'accessing-parsers' }
BioFASTAMultiParser >> parserForAmbibuousSequence [
" Answer a parser for DNA sequences using ambiguous letters (gaps are NOT allowed).
Each repetition is a flattened run of #dnaAmbiguousLetter followed by the parser from #newLineParser; at least one repetition is required (min: 1).
Unlike #parserForAmbibuousGappedSequence, no #removeNewlinesBlock action is attached here. "

^ (
#dnaAmbiguousLetter asPParser plus flatten ,
self newLineParser
) min: 1

]

{ #category : #'accessing-parsers' }
BioFASTAMultiParser >> parserForUnambibuousGappedSequence [
" Answer a parser for DNA alignment sequences using unambiguous letters (gaps are allowed).
Each repetition is a flattened run of #dnaUnambiguousLetterGapped followed by the parser from #newLineParser; at least one repetition is required (min: 1).
Unlike #parserForAmbibuousGappedSequence, no #removeNewlinesBlock action is attached here. "

^ (
#dnaUnambiguousLetterGapped asPParser plus flatten ,
self newLineParser
) min: 1

]

{ #category : #'accessing-parsers' }
BioFASTAMultiParser >> parserForUnambibuousSequence [
	" Answer a parser for DNA sequences using unambiguous letters (gaps are NOT allowed).
	Each repetition is a flattened run of #dnaLetter followed by the parser from #newLineParser; at least one repetition is required (min: 1).
	The single letter parser #dnaLetter is used instead of #dnaSequence: the latter already is 'dnaLetter plus flatten end', so it only succeeds when the letters run up to the end of the input, which rules out a trailing newline, further sequence lines or further records. "

	^ (
		#dnaLetter asPParser plus flatten ,
		self newLineParser
	) min: 1

]
16 changes: 0 additions & 16 deletions repository/BioParsers/BioMultiFASTAAlignmentParser.class.st

This file was deleted.

3 changes: 1 addition & 2 deletions repository/BioParsers/BioNCBIXMLBlastParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ Class {
BioNCBIXMLBlastParser >> initializeWith: aString [
" Private - Set the receiver's parser "

super initialize.
self expression: aString.
super initializeWith: aString.
parser := XMLPullParser parse: self expression.
]

Expand Down
18 changes: 13 additions & 5 deletions repository/BioParsers/BioParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -109,23 +109,31 @@ BioParser class >> parseMultiFasta: aFastaString [
" Parser aFastaString representing a MultiFASTA sequence.
Answer a FastaMultiRecord object "

^ (BioFASTAMultiParser on: aFastaString) parse
^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse
]

{ #category : #'parse-fasta' }
BioParser class >> parseMultiFastaAlignment: aFastaString [
" Parser aFastaString representing a <String> containing multiple sequences in FASTA format.
Answer a <BioFastaMultiRecord> object "

^ (BioMultiFASTAAlignmentParser on: aFastaString) parse
^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse
]

{ #category : #'parse-fasta' }
BioParser class >> parseMultiFastaAlignmentFile: aFastaFullFileLocation [
" Parse aFastaFullFileLocation representing a <String> containing a file with multiple sequences in FASTA format. Answer a <BioAlignment> object "

^ (BioMultiFASTAAlignmentParser
on: (self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment
^ (BioFASTAMultiParser onAmbigousGapped:
(self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment
]

{ #category : #'parse-fasta' }
BioParser class >> parseMultiFastaAlignmentFileFast: aFastaFullFileLocation [
" Parse aFastaFullFileLocation representing a <String> containing a file with multiple sequences in FASTA format. Answer a <BioAlignment> object.
The parser is created with BioFASTAMultiParser onAmbigousGapped:.
NOTE(review): unlike #parseMultiFastaAlignmentFile:, the object answered by #openFullFileNamed: is handed to the parser without sending #contents - confirm the parser accepts that object as its expression. "

^ (BioFASTAMultiParser
onAmbigousGapped: (self openFullFileNamed: aFastaFullFileLocation)) parse asAlignment
]

{ #category : #'parse-fasta' }
Expand Down Expand Up @@ -253,7 +261,7 @@ BioParser class >> tokenizeCSV: aCSVStringOrStream delimiter: aCharacter [
BioParser class >> tokenizeFasta: aString [
" Tokenize aString representing a FASTA sequence "

^ ( BioFASTABasicParser on: aString ) tokenize first
^ (BioFASTABasicParser on: aString) tokenize first
]

{ #category : #'parse-fasta' }
Expand Down
4 changes: 2 additions & 2 deletions repository/BioParsers/BioPhylipParser.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@ BioPhylipParser >> tokenizeInterleavedProtein [
parser :=
self firstLineTokenizer ,
(((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten ,
#proteinLetter asPParser trimBlanks star flatten ,
#proteinLetterGapped asPParser trimBlanks star flatten ,
#newline asPParser) star ,
(#proteinLetter asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock.
(#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock.
^ self tokenize.

]
51 changes: 30 additions & 21 deletions repository/BioParsers/PP2NodeFactory.extension.st
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
Extension { #name : #PP2NodeFactory }

{ #category : #'*BioParsers' }
PP2NodeFactory >> dnaAmbiguousLetterGapped [
" Answer a parser for a single DNA letter, using ambiguous IUPAC nomenclature.
Accepted characters are those answered by BioIUPACAmbiguousDNA lowercaseAndUppercaseCodesWithGap (presumably the lower and upper case codes plus gap characters - confirm in that class). "

^ PP2PredicateObjectNode anyOf: BioIUPACAmbiguousDNA lowercaseAndUppercaseCodesWithGap
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> dnaLetter [
"Answer a parse to parse any uppercased or lowercased DNA nucleotide letter"

^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream |
stream
<< BioIUPACUnambiguousDNA uppercaseCodes;
<< BioIUPACUnambiguousDNA lowercaseCodes ])
^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodes
]

{ #category : #'*BioParsers' }
Expand All @@ -17,36 +21,41 @@ PP2NodeFactory >> dnaSequence [
^ self dnaLetter plus flatten end
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> dnaUnambiguousLetterGapped [
" Answer a parser for a single DNA letter, using unambiguous IUPAC nomenclature.
Accepted characters are those answered by BioIUPACUnambiguousDNA lowercaseAndUppercaseCodesWithGap (presumably the lower and upper case codes plus gap characters - confirm in that class). "

^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodesWithGap
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> number [
" Answer the object answered by #number on a new PP2ExpressionGrammar instance "

^ PP2ExpressionGrammar new number
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> proteinLetter [
PP2NodeFactory >> proteinLetterGapped [
" Answer a parser for a protein letter, using extended IUPAC nomenclature "

^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream |
stream
<< BioIUPACProteinExtended codes asUppercase;
<< BioIUPACProteinExtended codes asLowercase;
<< '?-' ])
^ PP2PredicateObjectNode anyOf: BioIUPACProteinExtended lowercaseAndUppercaseCodesWithGap
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> proteinSequence [
" Answer a parser for Protein sequences, using IUPAC nomenclature "

^ self proteinLetter plus flatten end
PP2NodeFactory >> punctuation [

^ PP2PredicateObjectNode new
predicate: (PP2CharSetPredicate on: [ :char |
self punctuationCharacters includes: char ]);
predicateMessage: 'punctuation expected';
negated: (PP2CharSetPredicate on: [ :char |
(self punctuationCharacters includes: char) not ]);
negatedMessage: 'punctuation not expected';
yourself
]

{ #category : #'*BioParsers' }
PP2NodeFactory >> punctuation [
^ PP2PredicateObjectNode new
predicate: (PP2CharSetPredicate on: [ :char | '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char ]);
predicateMessage: 'punctuation expected';
negated: (PP2CharSetPredicate on: [ :char | ('.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char) not ]);
negatedMessage: 'punctuation not expected';
yourself
PP2NodeFactory >> punctuationCharacters [

^ '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~'
]
Loading

0 comments on commit 8c34b6b

Please sign in to comment.