Tuesday 9 July 2013

Text Language Detection with JAVACC


AN ALGORITHM TO DETECT TEXT LANGUAGE

This algorithm uses a JAVACC parser combined with JAVA batch routines.

The program first needs to read a long text to learn the lexical
characteristics of the words in that text's language.  After this learning
process you can call a subroutine to check the language of any word.

Possible improvements could be keeping the data in a file and improving
the lexical knowledge by making the program read as many good texts as
possible.

I have used the Kur'an, the Bible, Ataturk's Discourse (Nutuk) and Of Mice and Men
as input texts.

You can find all the source code, input text files and the test outputs related to this work at
my sourceforge page:
https://sourceforge.net/projects/javacctxtdetect/files

REALIZATION STEPS:

1- Make a JAVACC parser which reads all the chars in the text file.

TOKEN :
{


  < LETTER: [ "a"-"z", "A"-"Z", "_"] | <TURK_CHAR> >
|
  < DIGIT:  [ "0"-"9" ] >
|  <TURK_CHAR: ["\u00f6","\u00d6","\u00e7","\u00c7","\u00fe","\u00de"
   ,"\u00dd","\u00fc","\u00dc","\u00d0","\u00fd","\u00c2","\u00db","\u00e2","\u00fb"
   ,"\u00ee","\u2013","\u00ce"
   ,"\u00ed","\u00fa","\u00ec","`","\u007f","\u2022","\u00b7","\u00e9","\u00ab","\u00bb"] | <SMALL_TURK_G> >
 | <#SMALL_TURK_G: "\u00f0">
//ö=\u00f6,Ö=\u00d6,ç=\u00e7,Ç=\u00c7,ş=\u00fe,Ş=\u00de,i,İ=\u00dd,ü=\u00fc,Ü=\u00dc,ğ=\u00f0,Ğ=\u00d0,ı=\u00fd,I
//Â=\u00c2,Û=\u00db,â=\u00e2,û=\u00fb,î=\u00ee,–=\u2013,Î=\u00ce
//Nutuk parazit harfleri í=\u00ed,ú=\u00fa,ì=\u00ec,`=(96),=\u007f,•=\u2022,=\u00b7,é=\u00e9,«=\u00ab,»=\u00bb
}
TOKEN : /* SEPARATORS */
{
   < LPAREN: "(" >
 | < RPAREN: ")" >
 | < LBRACE: "{" >
 | < RBRACE: "}" >
 | < LBRACKET: "[" >
 | < RBRACKET: "]" >
 | < SEMICOLON: ";" >
 | < COMMA: "," >
 | < HASH: "#" >
 | < DOT: "." >
 | < DOLLAR: "$" >
}
TOKEN : /* OPERATORS */
{
   < ASSIGN: "=" >
 | < GT: ">" >
 | < LT: "<" >
 | < BANG: "!" >
 | < TILDE: "~" >
 | < HOOK: "?" >
 | < COLON: ":" >
 | < LE: "<=" >
 | < GE: ">=" >
 | < NE: "<>" >
 | < PLUS: "+" >
 | < MINUS: "-" >
 | < STAR: "*" >
 | < SLASH: "/" >
 | < BIT_AND: "&" >
 | < PERCENT: "%" >
 | < BACKSLASH : "\\" >
 | < EXPO: "^" >
}
TOKEN : /* PUNCTUATION */
{
 <SINGLE_QUOTE_OPEN: "’">
 | <SINGLE_QUOTE_CLOSE: "‘">
 | <DOUBLE_QUOTE_OPEN: "“">
 |<DOUBLE_QUOTE_CLOSE: "”">
 |<DOUBLE_QUOTE: "\"">
 |<SINGLE_QUOTE: "'">
}
TOKEN : /* WHITESPACE */
{
 <SPACE: " ">
 | <TAB: "\t">
 | <RETURN: "\r">
 | <FORM: "\f">
 | <NEWLINE: "\n">
}
/** Root production. */
void Input() :
{/*System.out.println("Start");*/ Token t;}
{    
 (
  (
   t=<LETTER>
   | t=<DIGIT>
   | t=Separators()
   | t=Operators()
   | t=Punctuations()
   | t=WhiteSpace()
  )
  {count++; System.out.print(t.image);} 
 )*
  <EOF>
}
//*****************************************************
Token Separators() :
{/*System.out.println("Seperators=");*/Token t;}
{
 (
    t=<LPAREN>
  | t=<RPAREN>
  | t=<LBRACE>
  | t=<RBRACE>
  | t=<LBRACKET>
  | t=<RBRACKET>
  | t=<SEMICOLON>
  | t=<COMMA>
  | t=<HASH>
  | t=<DOT>
  | t=<DOLLAR>
 )
 {return(t);}
}
Token Operators() :
{/*System.out.println("Operators=");*/Token t;}
{
 (
  t=<ASSIGN>
  | t=<GT>
  | t=<LT>
  | t=<BANG>
  | t=<TILDE>
  | t=<HOOK>
  | t=<COLON>
  | t=<LE>
  | t=<GE>
  | t=<NE>
  | t=<PLUS>
  | t=<MINUS>
  | t=<STAR>
  | t=<SLASH>
  | t=<BIT_AND>
  | t=<PERCENT>
  | t=<BACKSLASH>
  | t=<EXPO>
 )
 {return(t);}
}
Token Punctuations() :
{/*System.out.println("Punctuations=");*/Token t;}
{
 (
  t=<SINGLE_QUOTE_OPEN>
  | t=<SINGLE_QUOTE_CLOSE>
  | t=<DOUBLE_QUOTE_OPEN>
  | t=<DOUBLE_QUOTE_CLOSE>
  | t=<DOUBLE_QUOTE>
  | t=<SINGLE_QUOTE>
 )
 {return(t);}
}
 
Token WhiteSpace() :
{/*System.out.println("WhiteSpace=");*/Token t;}
{
 (
  t=<SPACE>
  | t=<TAB> 
  | t=<RETURN> 
  | t=<FORM>
  | t=<NEWLINE>
 )
 {return(t);}
}

2- Make it count words:

/** Root production. */
void Input() :
{/*System.out.println("Start");*/ Token t;}
{    
 (
  (
   Word()
   | t=<DIGIT>  {countCharFrequency(t.image);}
   | t=Separators() {countCharFrequency(t.image);}
   | t=Operators() {countCharFrequency(t.image);}
   | t=Punctuations() {countCharFrequency(t.image);}
   | t=WhiteSpace()
  )
  {count++;} 
 )*
 {displayCharFrequency();}
  <EOF>
}
...
void Word() :
{/*System.out.println("Word=");*/Token t; String s="";}
{
 (
  ( LOOKAHEAD(2)
   t=<LETTER> {countCharFrequency(t.image);count++;}
      {s=s+t.image;}
  )+
  {word_count++;count--;}
 )
 {System.out.println("------------>"+s);}
}

3- Make it count char frequency:

public void countCharFrequency(String curChar)
 {
 int i =(int)curChar.charAt(0);

 //System.err.println("\n"+curChar+"="+ i);
 if (i < 256) charFreq[i]++;
 if (i == 8217) unicodeCharFreq[i-8200]++;
  else if (i == 8211)  unicodeCharFreq[i-8200]++;
  else if (i == 8216)  unicodeCharFreq[i-8200]++;
  else if (i == 8220)  unicodeCharFreq[i-8200]++;
  else if (i == 8221) unicodeCharFreq[i-8200]++;
  else if (i == 8226) unicodeCharFreq[i-8200]++;
  else if (i > 255)  {
        System.err.println("Unicode char PROBLEM++++++++++++++++++>");
        System.err.println("\n"+curChar+"="+ i);
       }
 }
 public void displayCharFrequency()
 {
  NumberFormat formatter = new DecimalFormat("#0.000");
  NumberFormat formatter0 = new DecimalFormat("#,###,###");
  double d=0;

  for (int  i=0;i < 255; i++)
  {
   if (charFreq[i]>0)
   {
    System.err.print((char)i+"=");
    System.err.format("%1$10s",formatter0.format(charFreq[i]));
    d=((double)charFreq[i] / count) * 100;
    System.err.print(" ---> %"+ formatter.format(d));
    for (int j=0;j< d;j++)
     System.err.print(" +");
    System.err.println(">");
   }
  }
 }

 ...

 /** Root production. */
void Input() :
{/*System.out.println("Start");*/ Token t;}
{    
 (
  (
   t=<LETTER> {countCharFrequency(t.image);}
   | t=<DIGIT>  {countCharFrequency(t.image);}
   | t=Separators() {countCharFrequency(t.image);}
   | t=Operators() {countCharFrequency(t.image);}
   | t=Punctuations() {countCharFrequency(t.image);}
   | t=WhiteSpace()
  )
  {count++; System.out.print(t.image);} 
 )*
 {displayCharFrequency();}
  <EOF>
}

4- Make it count words and list, sorted, the words that occur more than 20 times.

 public void procWord(String wrd)
 {
  int i=0;

  for (i=0; i<wordArrSize;i++)
  {
   if ((wordArr[i].toUpperCase()).equals(wrd.toUpperCase()))
   {
    wordCountArr[i]++;
    break;
   }
  }

  if (i == wordArrSize)
  {
   wordArr[wordArrSize] = wrd;
   wordArrSize++;
   wordCountArr[i]++;
   System.out.println("wordArrSize="+wordArrSize);
  }
 }
 public void displayWordFrequency()
 {
  for (int i=0; i<wordArrSize;i++)
  {
   System.out.println(i+ "-->"+wordArr[i]+"="+ wordCountArr[i]);
  }
  /*
  for (int i=0; i<wordArrSize;i++)
  {
   if (wordCountArr[i]==1) System.out.println(i+ "-->"+wordArr[i]+"="+ wordCountArr[i]);
  }
  */
  bubbleSortWordFreq();
  System.err.format("\n\nTOT WORDS#=%1$7s%n",word_count);
  System.err.println("*************** WORDS referred > 20 times");

  NumberFormat formatter = new DecimalFormat("#0.000");
  NumberFormat formatter0 = new DecimalFormat("#,###,###");
  double d=0;

  for (int  i=0;i < wordArrSize; i++)
  {
   if (wordCountArr[i]>20)
   {
    System.err.print(String.format("%15s=",wordArr[i]));
    System.err.format("%1$10s",formatter0.format(wordCountArr[i]));
    d=((double)wordCountArr[i] / word_count) * 100;
    System.err.print(" ---> %"+ formatter.format(d));
    for (int j=0;j< d*10;j++)
     System.err.print(" +");
    System.err.println(">");
   }
  }
  System.err.format("  %1$10s%n","----------");
  System.err.format("TOT#=%1$7s%n",word_count);


/** Root production. */
void Input() :
{/*System.out.println("Start");*/ Token t;}
{    
 (
  (
   Word()
   | t=<DIGIT>  {countCharFrequency(t.image);}
   | t=Separators() {countCharFrequency(t.image);}
   | t=Operators() {countCharFrequency(t.image);}
   | t=Punctuations() {countCharFrequency(t.image);}
   | t=WhiteSpace()
  )
  {count++;} 
 )*
 {displayCharFrequency();}
 {bubbleSort(); displayWordFrequency();}
  <EOF>
}
void Word() :
{/*System.out.println("Word=");*/Token t; String s="";}
{
 (
  ( LOOKAHEAD(2)
   t=<LETTER> {countCharFrequency(t.image);count++;}
      {s=s+t.image;}
  )+
  {word_count++;count--;}
 )
 {System.out.println("------------>"+s);procWord(s);}
}

5- Make it count chars by position regardless of words:
This counts, for each character (for example all 'a' chars), how many occur at
position 1, position 2, and so on up to position 30.
The word boundaries are not taken into account.

6- Make it count chars by position on the basis of word length.
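This counts, for each character (for example all 'a' chars), how many occur at
position 1, position 2, and so on up to position 30, separately for words of
length 1, length 2, and so on up to length 30.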

This is the final solution which gives the correct answer.

You can find all the source code and input text files and also the test
outputs  related to this work at my sourceforge page:
https://sourceforge.net/projects/javacctxtdetect/files