Refactorings around parsing

hernanmd · Aug 25, 2023 · 8c34b6b · 8c34b6b
1 parent 61ccedb
commit 8c34b6b
Show file tree

Hide file tree

Showing 14 changed files with 231 additions and 77 deletions.
diff --git a/repository/BioParserTests/BioProteinParserTest.class.st b/repository/BioParserTests/BioProteinParserTest.class.st
@@ -8,13 +8,13 @@ Class {
 BioProteinParserTest >> setUp [
 
 	super setUp.
-	parser := #proteinSequence asPParser.
+	parser := #proteinLetterGapped asPParser.
 ]
 
 { #category : #testing }
 BioProteinParserTest >> testProteinLetterMatches [
 
-	parser := #proteinLetter asPParser.
+	parser := #proteinLetterGapped asPParser.
 
 	'ACDEFGHIKLMNPQRSTVWYBXZJUO' do: [: letter |
 		self assert: (parser matches: (String with: letter))].

diff --git a/repository/BioParsers/BioAbstractParser.class.st b/repository/BioParsers/BioAbstractParser.class.st
@@ -54,6 +54,36 @@ BioAbstractParser >> buildTokens: aCollection [
 	^ self results
 ]
 
+{ #category : #accessing }
+BioAbstractParser >> debug [
+	" Private - When results is nil or empty, send #debug: with the receiver's stored expression. Always answer results. NOTE(review): nothing visible in #debug: or #debugString: assigns results, so this may answer nil even after debugging - confirm "
+
+	(results isNil or: [ results isEmpty ])
+		ifTrue: [ self debug: expression ].
+	^ results
+]
+
+{ #category : #accessing }
+BioAbstractParser >> debug: aString [
+	" Debug aString with the receiver's parser. Answer whatever #debugString: answers (the parser's debug output, or the exception description when debugging failed) instead of discarding it "
+
+	^ self debugString: aString
+
+]
+
+{ #category : #accessing }
+BioAbstractParser >> debugString: aString [
+	" Answer the result of sending #debug: to the receiver's parser with aString. The receiver is marked with #beSuccess before the attempt; if an Exception is signalled it is marked with #beFailed and ex asString is answered instead. NOTE(review): the handler traps every Exception, not only Error - confirm this is intended "
+
+	self beSuccess.
+	^ [ parser debug:  aString ]
+		on: Exception
+		do: [: ex | 
+			self beFailed.
+			ex asString ].
+
+]
+
 { #category : #accessing }
 BioAbstractParser >> expression [
 	" Answer a String with the receiver's target expression "
@@ -70,6 +100,7 @@ BioAbstractParser >> expression: anObject [
 BioAbstractParser >> initializeWith: aString [
 	" Private - Send #initialize to the receiver, then set its target expression to be parsed to aString "
 
+	self initialize.
 	expression := aString
 ]
 

diff --git a/repository/BioParsers/BioAbstractTextParser.class.st b/repository/BioParsers/BioAbstractTextParser.class.st
@@ -52,7 +52,7 @@ BioAbstractTextParser >> parserForAccession [
 BioAbstractTextParser >> parserForDNAAlignment [
 	" Answer a parser for alignments, meaning sequences containing gaps "
 
-	^ #dnaAmbiguousLetterGapped asPParser
+	^ #dnaAmbiguousLetterGapped asPParser 
 ]
 
 { #category : #'accessing-parsers' }

diff --git a/repository/BioParsers/BioFASTABasicParser.class.st b/repository/BioParsers/BioFASTABasicParser.class.st
@@ -4,6 +4,9 @@ This class is not intended to be used directly
 Class {
 	#name : #BioFASTABasicParser,
 	#superclass : #BioIDParser,
+	#instVars : [
+		'fastaSequenceParser'
+	],
 	#category : #'BioParsers-Core'
 }
 
@@ -38,25 +41,62 @@ BioFASTABasicParser >> buildTokens: aCollection [
 ]
 
 { #category : #initialization }
-BioFASTABasicParser >> initialize [
-	" Private - See superimplementor's comment "
+BioFASTABasicParser >> fastaParser [
 
-	super initialize.
-	parser := ( 
+	^ ( 
 		self parserForNonBreakingString , 
 		#newline asPParser ,
-		self parserForSequence) plus end 
+		(self perform: self fastaSequenceParser)
+		) plus end 
 
 
 ]
 
+{ #category : #initialization }
+BioFASTABasicParser >> fastaSequenceParser [
+	" Answer the value held in fastaSequenceParser. #fastaParser passes it to #perform: on the receiver, so it is expected to be a unary selector answering a parser "
+	^ fastaSequenceParser 		
+
+]
+
+{ #category : #accessing }
+BioFASTABasicParser >> fastaSequenceParser: anObject [
+	" Set the value that #fastaParser performs on the receiver to obtain the sequence part of the FASTA parser "
+	fastaSequenceParser := anObject
+]
+
+{ #category : #'initialize-release' }
+BioFASTABasicParser >> initializeWith: aString [
+	" Private - See superimplementor's comment. Builds the receiver's parser from #fastaParser. A sequence parser selector set before this message is sent (as BioFASTAMultiParser class >> onAmbigousGapped: does) is kept; #parserForSequence is only the default when none was set "
+
+	super initializeWith: aString.
+	fastaSequenceParser ifNil: [ self fastaSequenceParser: #parserForSequence ].
+	parser := self fastaParser.
+
+]
+
+{ #category : #'accessing-parsers' }
+BioFASTABasicParser >> newLineParser [
+	" Answer a parser built as #newline asPParser plus optional, i.e. one or more newlines, or nothing at all "
+	^ #newline asPParser plus optional
+]
+
 { #category : #'accessing-parsers' }
 BioFASTABasicParser >> parserForSequence [
 	" Answer a parser for parsing sequences as usually found in FASTA formatted files "
 
-	^ (#word asPParser plus flatten , 
-		#newline asPParser plus optional ==> [ :nodes | (nodes copyWithout: Character cr) ]) 
-			min: 1
+	^ (
+			(
+			#word asPParser plus flatten , 
+			self newLineParser ==> self removeNewlinesBlock
+			) min: 1
+		) optimize
+]
+
+{ #category : #removing }
+BioFASTABasicParser >> removeNewlinesBlock [
+	" Answer a one-argument block answering a copy of its argument without the elements equal to Character cr or Character lf "
+	^ [ :nodes | nodes copyWithoutAll: { Character cr . Character lf } ]
 ]
 
 { #category : #'accessing private' }

diff --git a/repository/BioParsers/BioFASTAMultiParser.class.st b/repository/BioParsers/BioFASTAMultiParser.class.st
@@ -7,6 +7,24 @@ Class {
 	#category : #'BioParsers-Core'
 }
 
+{ #category : #'instance creation' }
+BioFASTAMultiParser class >> onAmbigousGapped: anExpressionString [
+	" Answer a new instance (created with basicNew) whose sequence parser selector is set to #parserForAmbibuousGappedSequence and which is then initialized with anExpressionString. NOTE(review): the selector is set before #initializeWith: is sent - confirm #initializeWith: does not overwrite it "
+	^ self basicNew
+		fastaSequenceParser: #parserForAmbibuousGappedSequence;
+		initializeWith: anExpressionString;
+		yourself
+]
+
+{ #category : #'instance creation' }
+BioFASTAMultiParser class >> onUnambigousGapped: anExpressionString [
+	" Answer a new instance (created with basicNew) whose sequence parser selector is set to #parserForUnambibuousGappedSequence and which is then initialized with anExpressionString. NOTE(review): the selector is set before #initializeWith: is sent - confirm #initializeWith: does not overwrite it "
+	^ self basicNew
+		fastaSequenceParser: #parserForUnambibuousGappedSequence;
+		initializeWith: anExpressionString;
+		yourself
+]
+
 { #category : #'accessing private' }
 BioFASTAMultiParser >> buildResults: aCollection [
 	" Answer an identified object for the receiver's parsing output "
@@ -26,3 +44,49 @@ BioFASTAMultiParser >> parseResultClass [
 
 	^ BioFastaMultiRecord
 ]
+
+{ #category : #'accessing-parsers' }
+BioFASTAMultiParser >> parserForAmbibuousGappedSequence [
+	" Answer (not install) a parser for DNA alignment sequences, where gaps are allowed: at least one repetition of a flattened run of #dnaAmbiguousLetterGapped matches followed by the receiver's newLineParser, each repetition post-processed with removeNewlinesBlock "
+
+	^ (
+			(
+			#dnaAmbiguousLetterGapped asPParser plus flatten , 
+			self newLineParser
+			) ==> self removeNewlinesBlock
+		)  min: 1
+
+]
+
+{ #category : #'accessing-parsers' }
+BioFASTAMultiParser >> parserForAmbibuousSequence [
+	" Answer (not install) a parser for DNA sequences where gaps are NOT allowed: at least one repetition of a flattened run of #dnaAmbiguousLetter matches followed by the receiver's newLineParser "
+
+	^ (
+		#dnaAmbiguousLetter asPParser plus flatten , 
+		self newLineParser
+		) min: 1
+
+]
+
+{ #category : #'accessing-parsers' }
+BioFASTAMultiParser >> parserForUnambibuousGappedSequence [
+	" Answer (not install) a parser for DNA alignment sequences, where gaps are allowed: at least one repetition of a flattened run of #dnaUnambiguousLetterGapped matches followed by the receiver's newLineParser "
+
+	^ (
+		#dnaUnambiguousLetterGapped asPParser plus flatten , 
+		self newLineParser
+		) min: 1
+
+]
+
+{ #category : #'accessing-parsers' }
+BioFASTAMultiParser >> parserForUnambibuousSequence [
+	" Answer (not install) a parser for DNA sequences where gaps are NOT allowed: at least one repetition of a flattened run of #dnaLetter matches followed by the receiver's newLineParser. #dnaLetter is used instead of #dnaSequence because the latter finishes with #end and therefore only matches at the very end of the input "
+
+	^ (
+		#dnaLetter asPParser plus flatten ,
+		self newLineParser
+		) min: 1
+
+]
diff --git a/repository/BioParsers/BioMultiFASTAAlignmentParser.class.st b/repository/BioParsers/BioMultiFASTAAlignmentParser.class.st
diff --git a/repository/BioParsers/BioNCBIXMLBlastParser.class.st b/repository/BioParsers/BioNCBIXMLBlastParser.class.st
@@ -8,8 +8,7 @@ Class {
 BioNCBIXMLBlastParser >> initializeWith: aString [
 	" Private - Set the receiver's parser "
 
-	super initialize.
-	self expression: aString.
+	super initializeWith: aString.
 	parser := XMLPullParser parse: self expression.
 ]
 

diff --git a/repository/BioParsers/BioParser.class.st b/repository/BioParsers/BioParser.class.st
@@ -109,23 +109,31 @@ BioParser class >> parseMultiFasta: aFastaString [
 	" Parser aFastaString representing a MultiFASTA sequence.
 	Answer a FastaMultiRecord object "
 
-	^ (BioFASTAMultiParser on: aFastaString) parse
+	^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse
 ]
 
 { #category : #'parse-fasta' }
 BioParser class >> parseMultiFastaAlignment: aFastaString [
 	" Parser aFastaString representing a <String> containing multiple sequences in FASTA format.
 	Answer a <BioFastaMultiRecord> object "
 
-	^ (BioMultiFASTAAlignmentParser on: aFastaString) parse
+	^ (BioFASTAMultiParser onAmbigousGapped: aFastaString) parse
 ]
 
 { #category : #'parse-fasta' }
 BioParser class >> parseMultiFastaAlignmentFile: aFastaFullFileLocation [
 	" Parse aFastaFullFileLocation representing a <String> containing a file with multiple sequences in FASTA format. Answer a <BioAlignment> object "
 
-	^ (BioMultiFASTAAlignmentParser
-		on: (self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment
+	^ (BioFASTAMultiParser onAmbigousGapped: 
+			(self openFullFileNamed: aFastaFullFileLocation) contents) parse asAlignment
+]
+
+{ #category : #'parse-fasta' }
+BioParser class >> parseMultiFastaAlignmentFileFast: aFastaFullFileLocation [
+	" Open the file named aFastaFullFileLocation and pass the object answered by #openFullFileNamed: directly to BioFASTAMultiParser class >> onAmbigousGapped: (no #contents is sent, unlike #parseMultiFastaAlignmentFile:). Parse it and answer the result converted with #asAlignment. NOTE(review): confirm the parser accepts that object where a String is otherwise given "
+
+	^ (BioFASTAMultiParser
+		onAmbigousGapped: (self openFullFileNamed: aFastaFullFileLocation)) parse asAlignment
 ]
 
 { #category : #'parse-fasta' }
@@ -253,7 +261,7 @@ BioParser class >> tokenizeCSV: aCSVStringOrStream delimiter: aCharacter [
 BioParser class >> tokenizeFasta: aString [
 	" Tokenize aString representing a FASTA sequence "
 
-	^ ( BioFASTABasicParser on: aString ) tokenize first
+	^ (BioFASTABasicParser on: aString) tokenize first
 ]
 
 { #category : #'parse-fasta' }

diff --git a/repository/BioParsers/BioPhylipParser.class.st b/repository/BioParsers/BioPhylipParser.class.st
@@ -208,9 +208,9 @@ BioPhylipParser >> tokenizeInterleavedProtein [
 	parser := 
 		self firstLineTokenizer ,
 		(((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten ,
-		#proteinLetter asPParser trimBlanks star flatten , 
+		#proteinLetterGapped asPParser trimBlanks star flatten , 
 		#newline asPParser) star ,
-			(#proteinLetter asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock.
+			(#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock.
 	^ self tokenize.
 
 ]
diff --git a/repository/BioParsers/PP2NodeFactory.extension.st b/repository/BioParsers/PP2NodeFactory.extension.st
@@ -1,13 +1,17 @@
 Extension { #name : #PP2NodeFactory }
 
+{ #category : #'*BioParsers' }
+PP2NodeFactory >> dnaAmbiguousLetterGapped [
+	" Answer a parser matching any single element of BioIUPACAmbiguousDNA lowercaseAndUppercaseCodesWithGap (presumably the ambiguous IUPAC DNA letters in both cases plus the gap symbol - confirm against that class) "
+
+	^ PP2PredicateObjectNode anyOf: BioIUPACAmbiguousDNA lowercaseAndUppercaseCodesWithGap
+]
+
 { #category : #'*BioParsers' }
 PP2NodeFactory >> dnaLetter [
 	"Answer a parse to parse any uppercased or lowercased DNA nucleotide letter"
 
-	^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream |
-		stream 
-			<< 	BioIUPACUnambiguousDNA uppercaseCodes;
-			<< 	BioIUPACUnambiguousDNA lowercaseCodes ])
+	^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodes
 ]
 
 { #category : #'*BioParsers' }
@@ -17,36 +21,41 @@ PP2NodeFactory >> dnaSequence [
 	^ self dnaLetter plus flatten end
 ]
 
+{ #category : #'*BioParsers' }
+PP2NodeFactory >> dnaUnambiguousLetterGapped [
+	" Answer a parser matching any single element of BioIUPACUnambiguousDNA lowercaseAndUppercaseCodesWithGap (presumably the unambiguous IUPAC DNA letters in both cases plus the gap symbol - confirm against that class) "
+
+	^ PP2PredicateObjectNode anyOf: BioIUPACUnambiguousDNA lowercaseAndUppercaseCodesWithGap
+]
+
 { #category : #'*BioParsers' }
 PP2NodeFactory >> number [
 	" Answer what #number answers when sent to a new PP2ExpressionGrammar "
 	^ PP2ExpressionGrammar new number
 ]
 
 { #category : #'*BioParsers' }
-PP2NodeFactory >> proteinLetter [
+PP2NodeFactory >> proteinLetterGapped [
 	" Answer a parser for a protein letter, using extended IUPAC nomenclature "
 
-	^ PP2PredicateObjectNode anyOf: (String streamContents: [ : stream |
-		stream
-			<< 	BioIUPACProteinExtended codes asUppercase;
-			<< BioIUPACProteinExtended codes asLowercase;
-			<< 	'?-' ])
+	^ PP2PredicateObjectNode anyOf: BioIUPACProteinExtended lowercaseAndUppercaseCodesWithGap
 ]
 
 { #category : #'*BioParsers' }
-PP2NodeFactory >> proteinSequence [
-	" Answer a parser for Protein sequences, using IUPAC nomenclature "
-
-	^ self proteinLetter plus flatten end
+PP2NodeFactory >> punctuation [
+
+	^ PP2PredicateObjectNode new
+		  predicate: (PP2CharSetPredicate on: [ :char |
+					   self punctuationCharacters includes: char ]);
+		  predicateMessage: 'punctuation expected';
+		  negated: (PP2CharSetPredicate on: [ :char |
+					   (self punctuationCharacters includes: char) not ]);
+		  negatedMessage: 'punctuation not expected';
+		  yourself
 ]
 
 { #category : #'*BioParsers' }
-PP2NodeFactory >> punctuation [
-	^ PP2PredicateObjectNode new
-		predicate: (PP2CharSetPredicate on: [ :char | '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char ]);
-		predicateMessage: 'punctuation expected';
-		negated: (PP2CharSetPredicate on: [ :char | ('.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~' includes: char) not ]);
-		negatedMessage: 'punctuation not expected';
-		yourself
+PP2NodeFactory >> punctuationCharacters [
+
+	^ '.,"''?!;:#$%&()*+-/<>=@[]\^_{}|~'
 ]