User:Aikoniv/hfst-optimized-lookup.notes

From Apertium
Jump to navigation Jump to search

Notes on various aspects of the hfst-optimized-lookup code

Describes file format, notes about operation: [1] [2]


.h:

typedef std::map<SymbolNumber,const char*> KeyTable;
enum FlagDiacriticOperator {P, N, R, D, C, U};
class TransducerHeader (line 93)
	- contains various flags/fields describing a transducer
	- constructor takes a FILE* and fread's the various properties
	- accessors to retrieve information
	- c.f. HeaderFlag
	
class FlagDiacriticOperation (line 190)
	- immutable, represents an operation
	- FlagDiacriticOperator operation;	SymbolNumber feature;	ValueNumber value;
	
class TransducerAlphabet (line 219)
	- stores a KeyTable, and information about flag diacritics (std::maps - feature_bucket, value_bucket)
	- constructor takes a FILE* and reads the KeyTable from it
	
class LetterTrie (line 272)
	- Each contains a list of SymbolNumber's and of more LetterTrie's
	- Has operations "add_string" and "find_key"
	
class Encoder (line 290)
	- used for symbol tokenization on input strings
	- contains a LetterTrie and a list of SymbolNumber's
	- has operation: SymbolNumber find_key(char** p)

class TransitionIndex (line 323)
	- contains an input symbol and a target index
	- bool final(void) { return first_transition_index == 1; }
	- bool matches(SymbolNumber s)
	
class Transition (line 359)
	- has an input symbol, output symbol, and target index
	- bool matches(SymbolNumber s)

class IndexTableReader (line 405)
	- has a list of TransitionIndex's
	- constructor reads in the table of transition indices from a FILE*
	
class TransitionTableReader (line 444)
	- has a list of Transition's
	- constructor reads in the table of transitions from a FILE*
	- contains an iterator over the transition list which can be advanced, used by various accessors
	
class Transducer (line 507)
	- contains TransducerHeader, TransducerAlphabet, key table, IndexTableReader, TransitionTableReader, Encoder,
	  list of display strings, symbol table, etc
	- constructor takes a FILE* and calls the constructors of various members with it to load the transducer:
		TransducerHeader, TransducerAlphabet, IndexTableReader, TransitionTableReader, Encoder
			    
	- virtual void printAnalyses(std::string prepend);

class TransducerUniq : public Transducer (line 609)
	- option '-u' = Suppress duplicate analyses
class TransducerFd : public Transducer (line 623)	- has FlagDiacriticStateStack and OperationVector
class TransducerFdUniq : public TransducerFd (line 643)

Here are the weighted (W) versions:

class TransitionWIndex (line 674)
	- like TransitionIndex but also with Weight final_weight(void)
	
class TransitionW (line 719)
	- like Transition but also with a Weight

class IndexTableReaderW (line 783)
class TransitionTableReaderW (line 822)

class TransducerW (line 885)
	- like Transducer, with weighted versions of members
	- has Weight current_weight, DisplayMultiMap instead of DisplayVector

class TransducerWUniq : public TransducerW (line 998)
class TransducerWFd : public TransducerW (line 1012)
class TransducerWFdUniq : public TransducerWFd (line 1035)

.cc:

bool print_usage(void) (line 28)
bool print_version(void) (line 61)
bool print_short_help(void) (line 71)

int main(int argc, char **argv) (line 77)
	- read/process arguments
	- try to open the input file and pass it to setup(FILE* f), exiting with setup's return value

void TransducerAlphabet::get_next_symbol(FILE * f, SymbolNumber k) (line 203)

void LetterTrie::add_string(const char * p, SymbolNumber symbol_key) (line 268)
SymbolNumber LetterTrie::find_key(char ** p) (line 282)

void Encoder::read_input_symbols(KeyTable * kt) (line 299)
SymbolNumber Encoder::find_key(char ** p) (line 315)


template <class genericTransducer> void runTransducer (genericTransducer T) (line 327)
	- allocate SymbolNumber* input_string, and char* str (= IO string)
	- loop over the input line by line until we reach the end or a tokenization error occurs
		- loop to convert the input character(s) into SymbolNumber's (T.find_next_key) -> input_string 
		   (terminated by a NO_SYMBOL_NUMBER)
		- bail if any character failed tokenization
		- do the analysis of the input_string [T.analyze(input_string); calls get_analyses]
		- print the analyses [T.printAnalyses(std:string(str))]
		

int setup(FILE * f) (line 382)
	- read TransducerHeader and TransducerAlphabet from input file
	- determine which transducer type to use:
		- determine if there are flag diacritics using alphabet.get_state_size()
		- determine if it is weighted using header.probe_flag(Weighted)
		- determine if we should suppress duplicates: displayUniqueFlag
	- create the correct transducer object passing in header and alphabet
	- call runTransducer with the new transducer object
bool TransducerFd::PushState(FlagDiacriticOperation op) (line 455)

bool TransitionIndex::matches(SymbolNumber s) (line 524)	- whether s matches our input symbol
bool Transition::matches(SymbolNumber s) (line 538)			- whether s matches our input symbol

void IndexTableReader::get_index_vector(void) (line 553)
	- convert the raw char* data read from the file into a vector
	
void TransitionTableReader::Set(TransitionTableIndex pos) (line 567)

void TransitionTableReader::get_transition_vector(void) (line 579)

bool TransitionTableReader::Matches(SymbolNumber s) (line 596)
bool TransitionTableReader::get_finality(TransitionTableIndex i) (line 602)

void Transducer::set_symbol_table(void) (line 614)
	- turn the key table (std::map<SymbolNumber,const char*>) into a symbol table (std::vector<const char*>)
	

**NOTES on the following algorithm implementation:	
	- input_symbol, output_symbol, and i refer to "current position" in the input string, output string, 
	   and tables respectively, while original_output_string refers to the beginning of the output string
	- TransitionTableIndex i can be referring to indices in either the transition index table or the 
	  transition table depending on the situation
		- if it is >= TRANSITION_TARGET_TABLE_START, it is referring to the i-TRANSITION_TARGET_TABLE_START'th
		  entry in the transition table
		- otherwise it is referring to the i'th entry in either the transition index table or the transition table:
		  *different functions expect different things*
	
void Transducer::try_epsilon_transitions(SymbolNumber * input_symbol,
					 SymbolNumber * output_symbol,
					 SymbolNumber * original_output_string,
					 TransitionTableIndex i) (line 627)
	- (i points to the first [i.e. potentially epsilon] entry of a state in the transition table)
	- for each transition with an epsilon input in this state (these are found at the start of the list of transitions)
		- append the transition output to the output string / advance output_symbol
		- call get_analyses with the same input_symbol and the transition target [which can be to either 
		  the transition index table (if there are multiple transition labels from the target state) or to the 
		  transition table (if there is only one transition label from the target state)]
										 
void TransducerFd::try_epsilon_transitions(SymbolNumber * input_symbol,
					   SymbolNumber * output_symbol,
					   SymbolNumber * original_output_string,
					   TransitionTableIndex i) (line 646)
	- same as above, plus flag diacritic handling

void Transducer::try_epsilon_indices(SymbolNumber * input_symbol,
				 SymbolNumber * output_symbol,
				 SymbolNumber * original_output_string,
				 TransitionTableIndex i) (line 697)
	- (i points to the first [i.e. potentially epsilon] entry for a state in the transition index table)
	- if the entry at i has an epsilon input
		- call try_epsilon_transitions with the same input_symbol/output_symbol and the target transition index
									 
void Transducer::find_transitions(SymbolNumber input,
				  SymbolNumber * input_symbol,
				  SymbolNumber * output_symbol,
				  SymbolNumber * original_output_string,
				  TransitionTableIndex i) (line 715)
	- (i points to the first entry of a "state" in the transition table: the current and following entries 
	   contain transitions until the next ∅ ∅ entry)
	- (input_symbol points to the symbol in the input string immediately following input [*(input_symbol-1)==input])
	- for each transition from from the current state
		- if the transition input matches our input
		- append the transition output to the output string / advange output_symbol
		- call get_analyses with the same input/input_symbol and the target of the transition
								  
void Transducer::find_index(SymbolNumber input,
			SymbolNumber * input_symbol,
			SymbolNumber * output_symbol,
			SymbolNumber * original_output_string,
			TransitionTableIndex i) (line 744)
	- (i points to a ∅ ∅ entry in the transition index table)
	- if the input value at i+input == input (if there is a transition from this state using this input)
		- call find_transitions with the same input/input_symbol/output_symbol, and the target index 
		   (pointing to transition table)

							
void Transducer::note_analysis(SymbolNumber * whole_output_string) (line 764)
	- create a string by looking up each SymbolNumber in the symbol_table, and add the string to the display_vector 
	- Equivalents:
		void TransducerUniq::note_analysis(SymbolNumber * whole_output_string) (line 784)
		void TransducerFdUniq::note_analysis(SymbolNumber * whole_output_string) (line 974)

void Transducer::get_analyses(SymbolNumber * input_symbol,
			  SymbolNumber * output_symbol,
			  SymbolNumber * original_output_string,
			  TransitionTableIndex i) (line 804)
	- if i is an index into the transition table [in which case it is pointing at a state's starting ∅ entry]
		- try_epsilon_transitions with i+1 [i+1 == the first actual transition entry for the current state; it may be epsilon]
		- if at end of input stream, note the analysis if i refers to a final transition, then return
		- call find_transitions on the next input symbol with i+1
	- else if i is an index into the transition index table [in which case it is pointing at a ∅ entry (although 
	  the compaction might mean that the actual memory value there isn't ∅)]
		- try_epsilon_indices with i+1 [i+1 == the first actual index for the current state; it may be epsilon]
		- if at end of input stream, note the analysis if i refers to a final index, then return
		- call find_index on the next input symbol with i+1

void Transducer::printAnalyses(std::string prepend) (line 879)
	- print the strings noted in display_vector to stdout
	- Equivalents:
		void TransducerUniq::printAnalyses(std::string prepend) (line 906)
		void TransducerFdUniq::printAnalyses(std::string prepend) (line 930)

Here are the weighted (W) versions:

bool TransitionWIndex::matches(SymbolNumber s) (line 958)
bool TransitionW::matches(SymbolNumber s) (line 972)
bool TransducerWFd::PushState(FlagDiacriticOperation op) (line 986)
void IndexTableReaderW::get_index_vector(void) (line 1052)
void TransitionTableReaderW::Set(TransitionTableIndex pos) (line 1066)
void TransitionTableReaderW::get_transition_vector(void) (line 1078)
bool TransitionTableReaderW::Matches(SymbolNumber s) (line 1100)
bool TransitionTableReaderW::get_finality(TransitionTableIndex i) (line 1106)
void TransducerW::set_symbol_table(void) (1119)
void TransducerW::try_epsilon_transitions(SymbolNumber * input_symbol,
					  SymbolNumber * output_symbol,
					  SymbolNumber * 
					  original_output_string,
					  TransitionTableIndex i) (line 1132)
void TransducerWFd::try_epsilon_transitions(SymbolNumber * input_symbol,
					    SymbolNumber * output_symbol,
					    SymbolNumber * 
					    original_output_string,
					    TransitionTableIndex i) (line 1161)
void TransducerW::try_epsilon_indices(SymbolNumber * input_symbol,
				      SymbolNumber * output_symbol,
				      SymbolNumber * original_output_string,
				      TransitionTableIndex i) (line 1216)
void TransducerW::find_transitions(SymbolNumber input,
				   SymbolNumber * input_symbol,
				   SymbolNumber * output_symbol,
				   SymbolNumber * original_output_string,
				   TransitionTableIndex i) (line 1234)
void TransducerW::find_index(SymbolNumber input,
			     SymbolNumber * input_symbol,
			     SymbolNumber * output_symbol,
			     SymbolNumber * original_output_string,
			     TransitionTableIndex i) (line 1270)
void TransducerW::note_analysis(SymbolNumber * whole_output_string) (line 1296)
	void TransducerWUniq::note_analysis(SymbolNumber * whole_output_string) (line 1308)
	void TransducerWFdUniq::note_analysis(SymbolNumber * whole_output_string) (line 1323)
void TransducerW::printAnalyses(std::string prepend) (line 1338)
	void TransducerWUniq::printAnalyses(std::string prepend) (line 1367)
	void TransducerWFdUniq::printAnalyses(std::string prepend) (1403)
void TransducerW::get_analyses(SymbolNumber * input_symbol,
			       SymbolNumber * output_symbol,
			       SymbolNumber * original_output_string,
			       TransitionTableIndex i) (line 1438)