Here is my solution to the
problem. It is fairly easy and simple to
understand...
TurkCharTestInp.txt is the test input file:
*********************************
ğ
ğ
ğ
ğ
a
a
ö
Ö
ç
Ç
ş
Ş
i
İ
ü
Ü
ğ
Ğ
ı
I
a
a
a
TurkCharOut.txt is the output produced by my parser.
********************************************
Start
Turkish g in System.out.println=ğğğğğğ
Turkish g as t.image=ğ
UTF8 char embedded in System.out.println("\u00f0")=(ğ)
CHAR=ğ<-- \u00f0
CHAR=ğ<-- \u00f0
CHAR=ğ<-- \u00f0
CHAR=a
CHAR=a
CHAR=ö<-- \u00f6
CHAR=Ö<-- \u00d6
CHAR=ç<-- \u00e7
CHAR=Ç<-- \u00c7
CHAR=ş<-- \u00fe
CHAR=Ş<-- \u00de
CHAR=i
CHAR=İ<-- \u00dd
CHAR=ü<-- \u00fc
CHAR=Ü<-- \u00dc
CHAR=ğ<-- \u00f0
CHAR=Ğ<-- \u00d0
CHAR=ı<-- \u00fd
CHAR=I
CHAR=a
CHAR=a
CHAR=a
TurkChars.jj is my parser’s JavaCC grammar file.
****************************************
// Generator settings for the TurkChars grammar.
options {
  // Non-static parser: main() creates it with "new TurkChars(mgr)".
  STATIC = false;
  // Have the generated character stream accept input characters beyond ASCII.
  UNICODE_INPUT = true;
  // Emit the parser class in addition to the token manager.
  BUILD_PARSER = true;
}
PARSER_BEGIN(TurkChars)
// by Ali Riza SARAL
// This JavaCC file handles non-standard Turkish chars.
import java.io.*;

/**
 * Demo parser that reads "TurkCharTestInp.txt" and echoes every token,
 * tagging the Turkish letters with the code point the grammar matched.
 */
public class TurkChars {

  /**
   * Opens the test input file, wires reader -> char stream -> token manager
   * -> parser, and runs the Input() production once.
   *
   * @param args unused
   * @throws ParseException if the input does not match the grammar
   * @throws UnsupportedEncodingException if the "Cp1252" charset is unavailable
   * @throws FileNotFoundException if "TurkCharTestInp.txt" cannot be opened
   */
  public static void main(String args[])
      throws ParseException, UnsupportedEncodingException, FileNotFoundException {
    InputStream is = new FileInputStream("TurkCharTestInp.txt");
    try {
      // Author's note: Windows-1254, UTF-8 and ISO-8859-9 do not work here.
      // The TURK_CHAR token lists code points in the 00xx range, so the bytes
      // have to be decoded one-to-one; Cp1252 does that for these letters.
      // NOTE(review): presumably the input file is saved as Windows-1254 - confirm.
      Reader r = new InputStreamReader(is, "Cp1252");
      SimpleCharStream scs = new SimpleCharStream(r);
      TurkCharsTokenManager mgr = new TurkCharsTokenManager(scs);
      TurkChars parser = new TurkChars(mgr);
      parser.Input();
    } finally {
      // Release the file handle even when parsing fails.
      try {
        is.close();
      } catch (IOException ignored) {
        // The stream was only read from; a failed close loses no data.
      }
    }
  }
}
PARSER_END(TurkChars)
// Line terminator. Accepts Windows (CR LF), Unix (LF) and bare CR endings so
// the test file parses regardless of the platform it was saved on; the
// longest match wins, so CR LF is still consumed as a single token.
TOKEN : {
  <CR_LF: "\r\n" | "\n" | "\r">
}

// Character tokens.
//   CHAR          one or more plain ASCII letters.
//   TURK_CHAR     exactly one Turkish letter, given as the code point the
//                 letter has after the input bytes were decoded as Cp1252:
//                   00f6 o-umlaut      00d6 O-umlaut
//                   00e7 c-cedilla     00c7 C-cedilla
//                   00fe s-cedilla     00de S-cedilla
//                   00dd dotted capital I
//                   00fc u-umlaut      00dc U-umlaut
//                   00d0 G-breve       00fd dotless i
//                   00f0 g-breve (via SMALL_TURK_G)
//   SMALL_TURK_G  private helper (the leading # means it is never returned
//                 as a token on its own); only used inside TURK_CHAR.
TOKEN : {
  <CHAR: (["A"-"Z","a"-"z"])+>
|
  <TURK_CHAR:
      ["\u00f6","\u00d6","\u00e7","\u00c7","\u00fe","\u00de",
       "\u00dd","\u00fc","\u00dc","\u00d0","\u00fd"]
    | <SMALL_TURK_G> >
|
  <#SMALL_TURK_G: "\u00f0">
}
//*****************************************************
/**
 * Top-level production: one header line followed by any number of
 * "token, line terminator" lines up to end of file.
 *
 * Prints "Start", then for the first token three demonstration lines (a raw
 * Turkish g inside a string literal, the token image, and the same letter
 * written as a Unicode escape). Every following token is echoed as
 * "CHAR=" plus its image; a TURK_CHAR token is additionally tagged with the
 * four-digit hex code of the character that was matched.
 *
 * Throws ParseException when the token sequence does not fit this shape.
 */
void Input() :
{
  System.out.println("Start");
  Token t;
}
{
  // Header line: the first token only drives the three demonstration prints.
  ( t=<CHAR> | t=<TURK_CHAR> )
  {
    System.out.println("Turkish g in System.out.println=ğğğğğğ");
    System.out.println("Turkish g as t.image=" + t.image);
    System.out.println("UTF8 char embedded in System.out.println(\"\\u00f0\")=" + "(\u00f0)");
  }
  <CR_LF>
  (
    // Enter the loop only when a character token AND its line end both follow.
    LOOKAHEAD(2)
    ( t=<CHAR> | t=<TURK_CHAR> )
    {
      System.out.print("CHAR=" + t.image);
      if (t.kind == TURK_CHAR) {
        // A TURK_CHAR image is always exactly one character. Deriving the
        // code from it keeps this action in step with the token table and
        // avoids comparing against non-ASCII literals, whose value depends
        // on the encoding the grammar file is compiled with.
        System.out.println("<-- \\u" + String.format("%04x", (int) t.image.charAt(0)));
      } else {
        System.out.println("");
      }
    }
    <CR_LF>
  )*
  <EOF>
}