Friday, 10 May 2013

JAVACC Character Encoding - Localization Solution

Many languages use characters in their alphabets that are not included in the standard character sets.  The Turkish language is no exception.  JAVACC provides utilities to handle these difficulties.

Here is my solution to the problem.  It is fairly easy and simple to understand...

TurkCharTestInp.txt   is the test input file:
*********************************
ğ
ğ
ğ
ğ
a
a
ö
Ö
ç
Ç
ş
Ş
i
İ
ü
Ü
ğ
Ğ
ı
I
a
a
a
 
TurkCharOut.txt  is the output produced by my parser.
********************************************
Start
Turkish g in System.out.println=ğğğğğğ
Turkish g as t.image=ğ
UTF8 char embedded in  System.out.println("\u00f0")=(ğ)
CHAR=ğ<-- \u00f0
CHAR=ğ<-- \u00f0
CHAR=ğ<-- \u00f0
CHAR=a
CHAR=a
CHAR=ö<-- \u00f6
CHAR=Ö<-- \u00d6
CHAR=ç<-- \u00e7
CHAR=Ç<-- \u00c7
CHAR=ş<-- \u00fe
CHAR=Ş<-- \u00de
CHAR=i
CHAR=İ<-- \u00dd
CHAR=ü<-- \u00fc
CHAR=Ü<-- \u00dc
CHAR=ğ<-- \u00f0
CHAR=Ğ<-- \u00d0
CHAR=ı<-- \u00fd
CHAR=I
CHAR=a
CHAR=a
CHAR=a

TurkChars.jj   is my parser’s JAVACC grammar file.
****************************************
// Generator settings for this grammar.
options {
    // Emit non-static parser/token-manager members; main() below creates
    // the token manager and the parser as ordinary instances.
    STATIC = false;
    // Have the generated character stream handle 16-bit Unicode input
    // rather than ASCII only.
    UNICODE_INPUT = true;
    // Generate the parser class as well as the token manager.
    BUILD_PARSER = true;
}

PARSER_BEGIN(TurkChars)

// by Ali Riza SARAL
// This JAVACC file handles non-standard Turkish chars.

import java.io.*;

/**
 * Demo parser that reads TurkCharTestInp.txt one token per line and echoes
 * each token, annotating Turkish letters with the code point they were
 * matched as.
 */
public class TurkChars {

    /**
     * Opens the test input file, wires it through a character stream and a
     * token manager, and runs the {@code Input} production over it.
     *
     * @param args ignored
     * @throws ParseException if the input does not match the grammar
     * @throws IOException    if the input file cannot be found, the charset
     *                        name is not supported, or closing the file fails
     */
    public static void main(String args[])
            throws ParseException, IOException {

        InputStream is = new FileInputStream("TurkCharTestInp.txt");
        try {
            // Decoding as Cp1252 turns each Turkish-letter byte into the
            // code point with the same numeric value (e.g. byte 0xF0 becomes
            // U+00F0), which is exactly what the TURK_CHAR token definitions
            // match. The author found that Windows-1254, UTF-8 and
            // ISO-8859-9 did not work here.
            // NOTE(review): this presumably assumes the input file is saved
            // in a single-byte Turkish code page -- confirm.
            Reader r = new InputStreamReader(is, "Cp1252");
            SimpleCharStream scs = new SimpleCharStream(r);
            TurkCharsTokenManager mgr = new TurkCharsTokenManager(scs);

            TurkChars parser = new TurkChars(mgr);
            parser.Input();
        } finally {
            // Release the file handle whether parsing succeeded or threw.
            is.close();
        }
    }
}

PARSER_END(TurkChars)
 
// Line terminator. Accepts Windows (CR LF), Unix (LF) and bare CR endings so
// the parser does not fail with a lexical error on input files that were not
// saved with Windows line endings. Longest-match tokenizing still consumes a
// CR LF pair as a single CR_LF token.
TOKEN : {
  <CR_LF: "\r\n" | "\n" | "\r">
}

// Letter tokens. The Turkish letters are listed by the code points they have
// after the input bytes were decoded in main(); see the Input production for
// which letter each code point stands for.
TOKEN : {
    // A run of one or more unaccented ASCII letters.
    <CHAR : ( ["a"-"z", "A"-"Z"] )+ >

    // Exactly one Turkish letter: either the small g-breve (kept as its own
    // private token below) or one of the other eleven letters.
  | <TURK_CHAR :
        <SMALL_TURK_G>
      | [ "\u00c7", "\u00e7",     // C-cedilla, c-cedilla
          "\u00d0",               // G-breve
          "\u00dd", "\u00fd",     // dotted capital I, dotless small i
          "\u00d6", "\u00f6",     // O-diaeresis, o-diaeresis
          "\u00de", "\u00fe",     // S-cedilla, s-cedilla
          "\u00dc", "\u00fc" ]    // U-diaeresis, u-diaeresis
    >

    // Private (never emitted on its own) helper for the small g-breve.
  | <#SMALL_TURK_G : "\u00f0">
}
 
//*****************************************************

/**
 * Top-level production: a sequence of lines, each holding one CHAR or
 * TURK_CHAR token followed by a CR_LF, terminated by end of file.
 *
 * Prints "Start" on entry. The first line triggers three demonstration
 * messages; every following line is echoed as "CHAR=" plus the token image,
 * and Turkish letters are additionally annotated with the four-digit hex
 * code point of the character that was matched.
 *
 * The annotation is derived from the token kind and the matched character
 * itself rather than from non-ASCII string literals, so it does not depend
 * on the encoding the grammar and generated sources are compiled with.
 */
void Input() :
{
                Token t;
                System.out.println("Start");
}
{
                // First line: show the same letter three different ways.
                (t=<CHAR> | t=<TURK_CHAR>)
                {
                    System.out.println("Turkish g in System.out.println=ğğğğğğ");
                    System.out.println("Turkish g as t.image="+t.image);
                    System.out.println("UTF8 char embedded in  System.out.println(\"\\u00f0\")=" + "(\u00f0)");
                }
                <CR_LF>
                (
                    LOOKAHEAD(2) (t=<CHAR> | t=<TURK_CHAR>)
                    {
                        System.out.print("CHAR="+t.image);
                        if (t.kind == TURK_CHAR) {
                            // TURK_CHAR always matches exactly one character;
                            // left-pad its hex value to four digits.
                            String hex = Integer.toHexString(t.image.charAt(0));
                            System.out.println("<-- \\u" + "0000".substring(hex.length()) + hex);
                        } else {
                            // Plain ASCII letters get no annotation.
                            System.out.println("");
                        }
                    }
                    <CR_LF>
                )*
                <EOF>
}