Difference between revisions of "Lttoolbox API examples"

From Apertium
Jump to navigation Jump to search
(that version only works for 'beer')
 
(12 intermediate revisions by 3 users not shown)
Line 1: Line 1:
  +
[[Exemple d'application Lttoolbox]]
  +
  +
{{TOCD}}
  +
==Basic example==
  +
 
This is a simple C++/lttoolbox version of the 'beer' program in [[Morphological dictionaries]]
 
This is a simple C++/lttoolbox version of the 'beer' program in [[Morphological dictionaries]]
   
Line 22: Line 27:
 
// Alphabet class
 
// Alphabet class
 
#include <lttoolbox/alphabet.h>
 
#include <lttoolbox/alphabet.h>
 
// Pool class
 
#include <lttoolbox/pool.h>
 
   
 
// State class
 
// State class
Line 80: Line 82:
 
//Rather than try to convert between Transducer and TransExe, we'll
 
//Rather than try to convert between Transducer and TransExe, we'll
 
//just write and read.
 
//just write and read.
FILE* fst=fopen("beer.fst", "w");
+
FILE* fst=fopen("beer.fst", "w");
  +
Compression::wstring_write(L"ber", fst); //write letters
t.write(fst);
 
  +
alphabet.write(fst); // write multichars
  +
Compression::multibyte_write(1, fst); // write num. fsts
  +
Compression::wstring_write(L"main", fst); // write name of first fst
  +
t.write(fst); // write fst
 
fclose(fst);
 
fclose(fst);
 
fst=fopen("beer.fst", "r");
 
fst=fopen("beer.fst", "r");
Line 89: Line 95:
 
fclose(fst);
 
fclose(fst);
   
  +
State *initial_state = new State();
Pool<vector<int> > *pool = new Pool<vector<int> >(1, vector<int>(50));
 
State *initial_state = new State(pool);
 
 
initial_state->init(te.getInitial());
 
initial_state->init(te.getInitial());
 
State current_state = *initial_state;
 
State current_state = *initial_state;
Line 126: Line 131:
 
}
 
}
   
// Not used, just don't want it to be empty...
 
 
if (current_state.isFinal(anfinals))
 
if (current_state.isFinal(anfinals))
 
{
 
{
  +
// Not used, just don't want it to be empty...
 
set<wchar_t> escaped;
 
set<wchar_t> escaped;
 
escaped.insert(L'$');
 
escaped.insert(L'$');
Line 143: Line 148:
 
}
 
}
 
</pre>
 
</pre>
  +
  +
[[File:Ltbeer.png|center]]
  +
  +
we can simplify the building of the transducers like this:
  +
<pre>
  +
// build "beer" manually
  +
int beer = initial;
  +
// these are the transitions b:b e:e e:e r:r
  +
beer = t.insertSingleTransduction(alphabet(L'b',L'b'), beer);
  +
beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  +
beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  +
beer = t.insertSingleTransduction(alphabet(L'r',L'r'), beer);
  +
  +
int beersg = beer;
  +
// these are 0:<n> 0:<sg>
  +
beersg = t.insertSingleTransduction(alphabet(0, n_sym), beersg);
  +
beersg = t.insertSingleTransduction(alphabet(0, sg_sym), beersg);
  +
t.setFinal(beersg);
  +
  +
// build "beers" manually
  +
int beerpl = beer;
  +
beerpl = t.insertSingleTransduction(alphabet(L's', 0), beerpl);
  +
beerpl = t.insertSingleTransduction(alphabet(0, n_sym), beerpl);
  +
beerpl = t.insertSingleTransduction(alphabet(0, pl_sym), beerpl);
  +
t.setFinal(beerpl);
  +
</pre>
  +
  +
==Writing/reading several transducers to the same file==
  +
  +
<pre>
  +
  +
// g++ -o test test.cc -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3
  +
  +
#include <cwchar>
  +
#include <cstdio>
  +
#include <iostream>
  +
#include <set>
  +
  +
#include <lttoolbox/ltstr.h>
  +
#include <lttoolbox/lt_locale.h>
  +
#include <lttoolbox/transducer.h>
  +
#include <lttoolbox/alphabet.h>
  +
#include <lttoolbox/regexp_compiler.h>
  +
#include <lttoolbox/compression.h>
  +
  +
int main (int argc, char** argv)
  +
{
  +
map<int, Transducer> patterns;
  +
Alphabet a;
  +
RegexpCompiler re;
  +
  +
LtLocale::tryToSetLocale();
  +
  +
FILE *output = stdout;
  +
FILE *fst = fopen(argv[1], "w+");
  +
  +
// Build transducers
  +
  +
a.includeSymbol(L"<n>");
  +
  +
re.initialize(&a);
  +
re.compile(L"foo");
  +
patterns[1] = re.getTransducer();
  +
re.initialize(&a);
  +
re.compile(L"bar");
  +
patterns[2] = re.getTransducer();
  +
re.initialize(&a);
  +
re.compile(L"baz");
  +
patterns[3] = re.getTransducer();
  +
  +
// Write out the transducers
  +
  +
a.write(fst);
  +
Compression::multibyte_write(patterns.size(), fst);
  +
fwprintf(output, L"Patterns: %d, Alphabet: %d\n", patterns.size(), a.size());
  +
  +
for(map<int, Transducer>::iterator it = patterns.begin(); it != patterns.end(); it++)
  +
{
  +
wchar_t buf[50];
  +
memset(buf, '\0', sizeof(buf));
  +
swprintf(buf, 50, L"%d", it->first);
  +
wstring id(buf);
  +
fwprintf(output, L"= %S =============================\n", id.c_str());
  +
it->second.show(a, output);
  +
  +
Compression::wstring_write(id, fst);
  +
it->second.write(fst);
  +
}
  +
  +
fclose(fst);
  +
fwprintf(output, L"\n\n");
  +
  +
// Now read in the transducers that we have written out, one by one.
  +
  +
FILE *new_fst = fopen(argv[1], "r");
  +
Alphabet new_alphabet;
  +
map<wstring, Transducer> transducers;
  +
  +
new_alphabet.read(new_fst);
  +
int len = Compression::multibyte_read(new_fst);
  +
  +
while(len > 0)
  +
{
  +
int len2 = Compression::multibyte_read(new_fst);
  +
wstring name = L"";
  +
while(len2 > 0)
  +
{
  +
name += static_cast<wchar_t>(Compression::multibyte_read(new_fst));
  +
len2--;
  +
}
  +
transducers[name].read(new_fst);
  +
len--;
  +
}
  +
fwprintf(output, L"Patterns: %d, Alphabet: %d\n", transducers.size(), new_alphabet.size());
  +
  +
for(map<wstring, Transducer>::iterator it = transducers.begin(); it != transducers.end(); it++)
  +
{
  +
fwprintf(output, L"= %S =============================\n", it->first.c_str());
  +
it->second.minimize();
  +
it->second.show(a, output);
  +
}
  +
  +
fclose(new_fst);
  +
  +
return 0;
  +
}
  +
</pre>
  +
  +
==Using regular expressions==
  +
  +
<pre>
  +
/*
  +
* g++ -o lt-regexp lt-regexp.cc -I/usr/include/libxml2 -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3 -llibxml2
  +
*/
  +
#include <cwchar>
  +
#include <cstdio>
  +
#include <cerrno>
  +
#include <string>
  +
#include <iostream>
  +
#include <list>
  +
#include <set>
  +
  +
#include <lttoolbox/ltstr.h>
  +
#include <lttoolbox/lt_locale.h>
  +
#include <lttoolbox/transducer.h>
  +
#include <lttoolbox/alphabet.h>
  +
#include <lttoolbox/state.h>
  +
#include <lttoolbox/regexp_compiler.h>
  +
#include <lttoolbox/match_exe.h>
  +
#include <lttoolbox/match_state.h>
  +
#include <lttoolbox/xml_parse_util.h>
  +
  +
wstring ws(char *arg)
  +
{
  +
wchar_t buf[1024];
  +
memset(buf, '\0', 1024);
  +
size_t num_chars = mbstowcs(buf, arg, strlen(arg));
  +
wstring ws(buf, num_chars);
  +
return ws;
  +
}
  +
  +
bool match(Transducer t, wstring str, Alphabet a)
  +
{
  +
map<int, int> finals;
  +
for(int i = 0; i < t.size(); i++)
  +
{
  +
if(!t.isFinal(i))
  +
{
  +
continue;
  +
}
  +
finals[i] = i;
  +
}
  +
MatchExe me(t, finals);
  +
MatchState ms;
  +
ms.clear();
  +
ms.init(me.getInitial());
  +
  +
for(wstring::iterator it = str.begin(); it != str.end(); it++)
  +
{
  +
wcout << ms.size() << " " << *it << endl;
  +
ms.step(a(*it, *it));
  +
}
  +
int val = ms.classifyFinals(me.getFinals());
  +
fwprintf(stdout, L"%d\n", val);
  +
  +
if(val != -1)
  +
{
  +
return true;
  +
}
  +
return false;
  +
}
  +
  +
int main (int argc, char** argv)
  +
{
  +
Alphabet alphabet;
  +
Transducer t;
  +
RegexpCompiler re;
  +
bool matched;
  +
  +
LtLocale::tryToSetLocale();
  +
  +
if(argc < 3)
  +
{
  +
wcout << L"Usage: lt-regexp <pattern> <string to match>" << endl;
  +
exit(-1);
  +
}
  +
  +
FILE *output = stdout;
  +
wstring pattern = ws(argv[1]);
  +
wstring s = ws(argv[2]);
  +
  +
re.initialize(&alphabet);
  +
re.compile(pattern);
  +
t = re.getTransducer();
  +
t.minimize();
  +
  +
t.show(alphabet, output);
  +
  +
matched = match(t, s, alphabet);
  +
  +
wcout << endl << pattern << " " << s << endl;
  +
}
  +
  +
</pre>
  +
  +
[[Category:Lttoolbox]]
  +
[[Category:Documentation in English]]

Latest revision as of 07:59, 5 July 2017

Exemple d'application Lttoolbox

Basic example[edit]

This is a simple C++/lttoolbox version of the 'beer' program in Morphological dictionaries

// g++ -o lt-beer lt_beer.cc -I/usr/local/include/lttoolbox-3.2 -L/usr/local/lib -llttoolbox3

#include <cwchar>
#include <cstdio>
#include <cerrno>
#include <string>
#include <iostream>
#include <list>
#include <set>

#include <lttoolbox/ltstr.h>

// LtLocale::tryToSetLocale()
#include <lttoolbox/lt_locale.h>

// Transducer class
#include <lttoolbox/transducer.h>

// Alphabet class
#include <lttoolbox/alphabet.h>

// State class
#include <lttoolbox/state.h>

// TransExe class
#include <lttoolbox/trans_exe.h>

int main (int argc, char** argv)
{
  Alphabet alphabet;
  Transducer t;

  // Set locale
  LtLocale::tryToSetLocale();

  // Include symbols into alphabet, keeping the values
  alphabet.includeSymbol(L"<n>");
  alphabet.includeSymbol(L"<sg>");
  alphabet.includeSymbol(L"<pl>");

  int n_sym = alphabet(L"<n>");
  int sg_sym = alphabet(L"<sg>");
  int pl_sym = alphabet(L"<pl>");
  // Initial state
  int initial = t.getInitial();

  // build "beer" manually
  int beer = initial;
  // these are the transitions b:b e:e e:e r:r
  beer = t.insertSingleTransduction(alphabet(L'b',L'b'), beer);
  beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  beer = t.insertSingleTransduction(alphabet(L'r',L'r'), beer);
  // these are 0:<n> 0:<sg>
  beer = t.insertSingleTransduction(alphabet(0, n_sym), beer);
  beer = t.insertSingleTransduction(alphabet(0, sg_sym), beer);
  t.setFinal(beer);

  // build "beers" manually
  int beers = t.getInitial();
  beers = t.insertSingleTransduction(alphabet(L'b',L'b'), beers);
  beers = t.insertSingleTransduction(alphabet(L'e',L'e'), beers);
  beers = t.insertSingleTransduction(alphabet(L'e',L'e'), beers);
  beers = t.insertSingleTransduction(alphabet(L'r',L'r'), beers);

  // this transition is s:0
  beers = t.insertSingleTransduction(alphabet(L's', 0), beers);
  beers = t.insertSingleTransduction(alphabet(0, n_sym), beers);
  beers = t.insertSingleTransduction(alphabet(0, pl_sym), beers);
  t.setFinal(beers);

  t.minimize();

  //Rather than try to convert between Transducer and TransExe, we'll
  //just write and read.
  FILE* fst=fopen("beer.fst", "w");       
  Compression::wstring_write(L"ber", fst);   //write letters
  alphabet.write(fst); // write multichars
  Compression::multibyte_write(1, fst); // write num. fsts
  Compression::wstring_write(L"main", fst);  // write name of first fst
  t.write(fst); // write fst
  fclose(fst);
  fst=fopen("beer.fst", "r");

  TransExe te;
  te.read(fst, alphabet);
  fclose(fst);

  State *initial_state = new State();
  initial_state->init(te.getInitial());
  State current_state = *initial_state;

  wstring input, output=L"";

  set<Node *> anfinals;
  anfinals.insert(te.getFinals().begin(), te.getFinals().end());

  FILE* in=stdin;

  bool reading=true;
  // This is our runtime: see if the input matches
  while (reading)
  {
    wchar_t val = (wchar_t)fgetwc(in);
    if(val==WEOF||iswspace(val))
    {
      reading=false;
    }
    else
    {
      if (!reading)
      {
        // At the end. We don't need to do anything but
        // leave the loop in this simplistic example
        break;
      }
      else
      {
        current_state.step(val);
        alphabet.getSymbol(input, val);
      }
    }
  }

  if (current_state.isFinal(anfinals))
  {
    // Not used, just don't want it to be empty...
    set<wchar_t> escaped;
    escaped.insert(L'$');
    output = current_state.filterFinals(anfinals, alphabet, escaped);

    wcout << input << output << endl;
  }
  else
  {
    wcout << L"Unrecognised: " << input << endl;
  }

  return 0;
}
Ltbeer.png

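We can simplify the construction of the transducer by building the shared "beer" prefix once and branching from its last state for the singular and the plural: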

  // build "beer" manually
  int beer = initial;
  // these are the transitions b:b e:e e:e r:r
  beer = t.insertSingleTransduction(alphabet(L'b',L'b'), beer);
  beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  beer = t.insertSingleTransduction(alphabet(L'e',L'e'), beer);
  beer = t.insertSingleTransduction(alphabet(L'r',L'r'), beer);

  int beersg = beer;
  // these are 0:<n> 0:<sg>
  beersg = t.insertSingleTransduction(alphabet(0, n_sym), beersg);
  beersg = t.insertSingleTransduction(alphabet(0, sg_sym), beersg);
  t.setFinal(beersg);

  // build "beers" manually
  int beerpl = beer;
  beerpl = t.insertSingleTransduction(alphabet(L's', 0), beerpl);
  beerpl = t.insertSingleTransduction(alphabet(0, n_sym), beerpl);
  beerpl = t.insertSingleTransduction(alphabet(0, pl_sym), beerpl);
  t.setFinal(beerpl);

Writing/reading several transducers to the same file[edit]


// g++ -o test test.cc -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3 

#include <cwchar>
#include <cstdio>
#include <iostream>
#include <set>

#include <lttoolbox/ltstr.h>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/transducer.h>
#include <lttoolbox/alphabet.h>
#include <lttoolbox/regexp_compiler.h>
#include <lttoolbox/compression.h>

/**
 * Compiles three regular expressions into transducers, writes the
 * alphabet and all transducers to the file named by argv[1], then reads
 * everything back from that file and prints each transducer.
 *
 * File layout written here: alphabet, number of transducers, then for
 * each transducer its name (as a wstring) followed by the transducer.
 *
 * @param argc number of command-line arguments; must be at least 2.
 * @param argv argv[1] is the path of the file to write and re-read.
 * @return 0 on success, 1 on a missing argument or if the file cannot
 *         be opened.
 */
int main (int argc, char** argv)
{
  map<int, Transducer> patterns;
  Alphabet a;
  RegexpCompiler re;

  LtLocale::tryToSetLocale();

  FILE *output = stdout;

  if(argc < 2)
  {
    fwprintf(stderr, L"Usage: test <transducer file>\n");
    return 1;
  }

  // Binary mode: the file holds compressed binary data, not text.
  FILE *fst = fopen(argv[1], "wb");
  if(fst == NULL)
  {
    fwprintf(stderr, L"Error: cannot open output file for writing\n");
    return 1;
  }

  // Build transducers

  a.includeSymbol(L"<n>");

  // One transducer per expression, stored under the keys 1, 2, 3.
  const wchar_t *regexes[] = { L"foo", L"bar", L"baz" };
  const int num_regexes = sizeof(regexes) / sizeof(regexes[0]);
  for(int i = 0; i < num_regexes; i++)
  {
    re.initialize(&a);
    re.compile(regexes[i]);
    patterns[i + 1] = re.getTransducer();
  }

  // Write out the transducers

  a.write(fst);
  Compression::multibyte_write(patterns.size(), fst);
  // The casts keep the arguments in step with the %d conversions.
  fwprintf(output, L"Patterns: %d, Alphabet: %d\n",
           static_cast<int>(patterns.size()), static_cast<int>(a.size()));

  for(map<int, Transducer>::iterator it = patterns.begin(); it != patterns.end(); ++it)
  {
    // The numeric key, rendered as text, becomes the transducer's name.
    wchar_t buf[50];
    swprintf(buf, 50, L"%d", it->first);
    wstring id(buf);
    // %ls is the standard conversion for a wide string.
    fwprintf(output, L"= %ls =============================\n", id.c_str());
    it->second.show(a, output);

    Compression::wstring_write(id, fst);
    it->second.write(fst);
  }

  fclose(fst);
  fwprintf(output, L"\n\n");

  // Now read in the transducers that we have written out, one by one.

  FILE *new_fst = fopen(argv[1], "rb");
  if(new_fst == NULL)
  {
    fwprintf(stderr, L"Error: cannot reopen output file for reading\n");
    return 1;
  }

  Alphabet new_alphabet;
  map<wstring, Transducer> transducers;

  new_alphabet.read(new_fst);

  for(int len = Compression::multibyte_read(new_fst); len > 0; len--)
  {
    // Name: a length followed by that many characters.
    wstring name = L"";
    for(int len2 = Compression::multibyte_read(new_fst); len2 > 0; len2--)
    {
      name += static_cast<wchar_t>(Compression::multibyte_read(new_fst));
    }
    transducers[name].read(new_fst);
  }
  fwprintf(output, L"Patterns: %d, Alphabet: %d\n",
           static_cast<int>(transducers.size()),
           static_cast<int>(new_alphabet.size()));

  for(map<wstring, Transducer>::iterator it = transducers.begin(); it != transducers.end(); ++it)
  {
    fwprintf(output, L"= %ls =============================\n", it->first.c_str());
    it->second.minimize();
    // Show with the alphabet that was read back, so the output reflects
    // what is actually stored in the file.
    it->second.show(new_alphabet, output);
  }

  fclose(new_fst);

  return 0;
}

Using regular expressions[edit]

/* 
 * g++ -o lt-regexp lt-regexp.cc -I/usr/include/libxml2 -I/home/fran/local/include/lttoolbox-3.2 -L/home/fran/local/lib -llttoolbox3 -lxml2
 */
#include <cwchar>
#include <cstdio>
#include <cerrno>
#include <string>
#include <iostream>
#include <list>
#include <set>

#include <lttoolbox/ltstr.h>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/transducer.h>
#include <lttoolbox/alphabet.h>
#include <lttoolbox/state.h>
#include <lttoolbox/regexp_compiler.h>
#include <lttoolbox/match_exe.h>
#include <lttoolbox/match_state.h>
#include <lttoolbox/xml_parse_util.h>

/**
 * Converts a multibyte C string (interpreted in the current locale)
 * into a wide string.
 *
 * @param arg NUL-terminated multibyte string; may be a null pointer.
 * @return the converted text, or an empty string when arg is null,
 *         empty, or contains a sequence that is invalid in the current
 *         locale.
 */
std::wstring ws(char *arg)
{
  if(arg == NULL)
  {
    return std::wstring();
  }

  // A multibyte string never yields more wide characters than it has
  // bytes, so its byte length is a safe upper bound for the buffer.
  const size_t byte_len = strlen(arg);
  if(byte_len == 0)
  {
    return std::wstring();
  }

  std::wstring wide(byte_len, L'\0');
  const size_t num_chars = mbstowcs(&wide[0], arg, byte_len);
  if(num_chars == static_cast<size_t>(-1))
  {
    // Invalid multibyte sequence: do not use the error value as a length.
    return std::wstring();
  }

  wide.resize(num_chars);
  return wide;
}

/**
 * Runs str through transducer t, one character at a time, and reports
 * whether classifyFinals() yields something other than -1 at the end.
 *
 * Prints the MatchState size and the current character before each
 * step, and the classifyFinals() value afterwards.
 *
 * @param t   transducer to match against
 * @param str text to feed through the transducer
 * @param a   alphabet used to turn each character c into the pair c:c
 * @return true when classifyFinals() returned a value other than -1
 */
bool match(Transducer t, wstring str, Alphabet a)
{
  // Register every final state of t, keyed and valued by its own number.
  map<int, int> finals;
  for(int state = 0; state < t.size(); state++)
  {
    if(t.isFinal(state))
    {
      finals[state] = state;
    }
  }

  MatchExe me(t, finals);
  MatchState ms;
  ms.clear();
  ms.init(me.getInitial());

  for(size_t pos = 0; pos < str.size(); pos++)
  {
    const wchar_t symbol = str[pos];
    wcout << ms.size() << " " << symbol << endl;
    ms.step(a(symbol, symbol));
  }

  const int val = ms.classifyFinals(me.getFinals());
  fwprintf(stdout, L"%d\n", val);

  return val != -1;
}

int main (int argc, char** argv)
{
  Alphabet alphabet;
  Transducer t;
  RegexpCompiler re;
  bool matched;

  LtLocale::tryToSetLocale();

  if(argc < 3) 
  {
    wcout << L"Usage: lt-regexp <pattern> <string to match>" << endl;
    exit(-1);
  }

  FILE *output = stdout;
  wstring pattern = ws(argv[1]);
  wstring s = ws(argv[2]);

  re.initialize(&alphabet);
  re.compile(pattern);
  t = re.getTransducer();
  t.minimize();

  t.show(alphabet, output);

  matched = match(t, s, alphabet); 

  wcout << endl << pattern << " " << s << endl;
}