/******************************************************************************
* Parser to convert roman transliterations into Unicode Devnagari
* Copyright (C) 2003 Roshan Kamath
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
******************************************************************************/
/******************************************************************************
* BEGIN DEFINITIONS
******************************************************************************/
/*
** First the Local Defines that will be used in the program
*/
#define VIRAM "्"
#define ZWNJ "" /* Zero Width Non Joiner */
#define ZWJ "" /* Zero Width Joiner */
/*
** Prototype Declarations
**
** Every helper is declared with a full prototype: parameters are named and
** functions that take no arguments say so with (void), so that every call in
** the rules section is type checked by the compiler.
*/
int isVowel(int c);
int isConsonant(int c);
void addHamza(void);
void addViram(void);
void reduceVowel(void);
void encode(char *utfValue);
void convh2s(void);
void adjustDanda(void);
void correctAnusvar(void);
/*
** Flag that decides whether scanned text is transliterated from English to
** UCS or passed through unchanged. It starts out cleared (0); the tag rules
** in the rules section set it to 1 to start transliteration and back to 0
** to stop it. encode() and most helpers test it before doing anything.
*/
int transliterate = 0;
/*
** Flag that decides whether the ZWNJ is printed after a virAm. It is toggled
** every time a '\' is scanned, so text enclosed in a pair of '\' does not
** get the ZWNJ. Note that the default value is 1!
*/
int useNonJoiner = 1;
/*
** Flag that suppresses the implicit virAms: while it is non-zero addViram()
** returns without printing anything. It starts out cleared (0); the tag
** rules in the rules section set it to 1 and back to 0.
*/
int noViram = 0;
/******************************************************************************
* END DEFINITIONS
******************************************************************************/
%%
/******************************************************************************
* BEGIN RULES
******************************************************************************/
/*
** Skip trailing white space: blanks and tabs that run up to the end of a
** line match this pattern, which has no action, so nothing is output for
** them
*/
[ \t]+$
/*
** Replace any other run of white space (blank, tab, carriage return) by a
** single space. addHamza() then puts a Hamza (') back on the input if a
** Vowel follows the white space and transliteration is switched on
*/
[ \t\r]+ printf(" "); addHamza();
/*
** Detect when to start and stop transliteration. Add a Hamza (')
** if a Vowel follows
*/
"" transliterate = 1; addHamza(); /* Set to TRUE */
"" transliterate = 0; /* Set to FALSE */
/*
** Detect when to start and stop adding the implicit virAms.
*/
"" noViram = 1; /* Set to TRUE */
"" noViram = 0; /* Set to FALSE */
/*
** ECHO anything of the form <...> (an HTML tag) unchanged. Add a Hamza (')
** if a Vowel follows the tag
*/
\<[^\>]*\> ECHO; addHamza();
/*
** A '\' is never output: it only toggles the useNonJoiner flag, which
** decides whether the ZWNJ is printed after a virAm
*/
\\ useNonJoiner = !useNonJoiner;
/*
** Cover all punctuation marks: each one is echoed unchanged and a Hamza (')
** is added if a Vowel follows. Update this list regularly :)
*/
[-~!@#$%&*\(\)_+=\{\}\[\];\"\<,?/] ECHO; addHamza();
/*
** Special Cases: symbols for which UCS has no font, or which are used with
** a private meaning in this scheme. When transliteration is on, both rules
** rewrite the input by pushing a replacement back with unput() (last
** character first) so that the regular rules pick it up on the next scan;
** otherwise the symbol is echoed unchanged
*/
` {
    if (transliterate) {
        /* Glottal Stop: rescanned as .a, after addHamza() has had a look
           at what follows */
        addHamza();
        unput('a');
        unput('.');
    } else {
        ECHO;
    }
}
w {
    if (!transliterate) {
        ECHO;
    } else {
        /* w is rescanned as v.d */
        unput('d');
        unput('.');
        unput('v');
    }
}
/*
** Begin UCS Encodings here 0x0901 to 0x097F. Note that the encode function
** uses the transliterate flag to determine whether to encode or not. This
** is also true for the other helpers like addViram(), addHamza() etc.
**
** In case of a consonant, we add a virAm (.h) in case it is not followed by
** a vowel. This can be inhibited by the tags.
**
** For a vowel, we convert any following .h to .s. This is only to maintain
** backward compatibility with my personal Jtrans convention. [This has,
** however, been temporarily disabled.]
**
** Note: Since UCS does not have the short versions of the aE and aO vowels
** we represent the short versions by using the _ accent! To maintain
** consistency, even the short version of E and O are rendered similarly.
**
** Finally, we eliminate the 'a' vowel directly since the default glyph
** already has the vowel implicitly present. Of course, this is not done
** blindly :)
*/
/* The .n sign is encoded directly */
\.n encode("ँ");
/* 2306 exists, but I like to change the .N to .N^k or .N^j. This is done by
   correctAnusvar(), which looks at the character that follows the .N */
\.N correctAnusvar();
/* 2307 is the ':' But I shall use the standard ASCII version */
/* 2308 is a Hole */
/* Independent vowels are written with a leading ' (Hamza). Where convh2s()
   is called, a .h that directly follows the vowel is dropped */
'a encode("अ"); convh2s();
'A encode("आ"); convh2s();
'i encode("इ");
'I encode("ई"); convh2s();
'u encode("उ");
'U encode("ऊ"); convh2s();
'\.r encode("ऋ");
'\.l encode("ऌ");
/* 2317 is not used */
/* 2318 is usable, but using it causes confusion/problems */
/* The short vowels below have no rule of their own: reduceVowel() pushes a
   .s back on the input and the long form is pushed in front of it, so the
   text is rescanned as the long vowel followed by .s */
'e { /* Use 'E.s for 'e */
if (transliterate) {
reduceVowel(); unput('E'); unput('\'');
} else
ECHO;
}
'E encode("ए"); convh2s();
/* Use 'aE.s for 'ae */
'ae {
if (transliterate) {
reduceVowel(); unput('E'); unput('a'); unput('\'');
} else
ECHO;
}
'aE encode("ऐ"); convh2s();
/* 2321 is not used */
/* 2322 is usable, but using it causes confusion/problems */
'o { /* Use 'O.s for 'o */
if (transliterate) {
reduceVowel(); unput('O'); unput('\'');
} else
ECHO;
}
'O encode("ओ"); convh2s();
/* Use 'aO.s for 'ao */
'ao {
if (transliterate) {
reduceVowel(); unput('O'); unput('a'); unput ('\'');
} else
ECHO;
}
'aO encode("औ"); convh2s();
/* Consonants. Each rule prints the letter through encode() and then calls
   addViram(), which adds a virAm unless a vowel follows or the implicit
   virAms are switched off. A trailing H selects the aspirated letter */
k encode("क"); addViram();
kH encode("ख"); addViram();
g encode("ग"); addViram();
gH encode("घ"); addViram();
\.N^k encode("ङ"); addViram();
Ch encode("च"); addViram();
ChH encode("छ"); addViram();
j encode("ज"); addViram();
jH encode("झ"); addViram();
\.N^j encode("ञ"); addViram();
T encode("ट"); addViram();
TH encode("ठ"); addViram();
D encode("ड"); addViram();
DH encode("ढ"); addViram();
N encode("ण"); addViram();
t encode("त"); addViram();
tH encode("थ"); addViram();
d encode("द"); addViram();
dH encode("ध"); addViram();
n encode("न"); addViram();
/* 2345 is not used */
p encode("प"); addViram();
pH encode("फ"); addViram();
b encode("ब"); addViram();
bH encode("भ"); addViram();
m encode("म"); addViram();
y encode("य"); addViram();
r encode("र"); addViram();
/* 2353 is not used */
l encode("ल"); addViram();
L encode("ळ"); addViram();
/* 2356 is not used */
v encode("व"); addViram();
Sh encode("श"); addViram();
Xh encode("ष"); addViram();
s encode("स"); addViram();
h encode("ह"); addViram();
/* 2362 is a Hole */
/* 2363 is a Hole */
/* .d and .a are normally produced by the w and ` rules, which push them back
   on the input, rather than typed by the user */
\.d encode("़"); addViram(); /* This is usually NEVER part of external
input */
\.a encode("ऽ"); /* This is NEVER part of external input */
/* Dependent vowels (no leading '). The plain 'a' has no sign of its own: it
   is normally just consumed, and adjustDanda() echoes it only when what
   follows suggests a mistake in the input text */
a {
if (!transliterate) {
ECHO;
} else {
/* Consume any 'a' symbols judiciously since they are irrelevant
in UCS */
adjustDanda();
convh2s(); /* In case there is a .h following */
}
}
A encode("ा"); convh2s();
i encode("ि");
I encode("ी"); convh2s();
u encode("ु");
U encode("ू"); convh2s();
\.r encode("ृ");
\.R encode("ॄ"); convh2s();
/* 2373 is not used */
/* 2374 is usable, but using it causes confusion/problems */
/* As for the independent vowels, the short forms are rescanned as the long
   form followed by .s: reduceVowel() pushes the .s back first and the long
   form is then pushed in front of it */
e { /* Use E.s for e */
if (transliterate) {
reduceVowel(); unput('E');
} else
ECHO;
}
E encode("े"); convh2s();
/* Use aE.s for ae */
ae {
if (transliterate) {
reduceVowel(); unput('E'); unput('a');
} else
ECHO;
}
aE encode("ै"); convh2s();
/* 2377 is not used */
/* 2378 is usable, but using it causes confusion/problems */
o { /* Use O.s for o */
if (transliterate) {
reduceVowel(); unput('O');
} else
ECHO;
}
O encode("ो"); convh2s();
/* Use aO.s for ao */
ao {
if (transliterate) {
reduceVowel(); unput('O'); unput('a');
} else
ECHO;
}
aO encode("ौ"); convh2s();
\.h {
    if (!transliterate) {
        ECHO;
    } else {
        /* Explicit virAm, followed by the NonJoiner unless the latter
           is currently inhibited */
        printf("%s",VIRAM);
        if (useNonJoiner) {
            printf("%s",ZWNJ);
        }
    }
}
/* 2382 is a Hole */
/* 2383 is a Hole */
/* 2384 is not used */
/* 2385 is not used */
/* .s is the accent that marks a reduced (short) vowel; it is normally pushed
   on the input by reduceVowel() */
\.s encode("॒");
/* 2387 is not used */
/* 2388 is not used */
/* 2389 is a Hole */
/* 2390 is a Hole */
/* 2391 is a Hole */
/* Additional consonants; handled exactly like the basic consonants: print
   the letter, then let addViram() decide about the virAm */
q encode("क़"); addViram();
Kh encode("ख़"); addViram();
/* Print both code points with a single encode() call: when transliteration
   is off encode() ECHOes the matched text, so two calls would output "Gh"
   twice. NOTE(review): the base letter is kept as in the original rule;
   2394 in the code chart is built on the letter used for g, not gH, so
   confirm which base letter is intended */
Gh encode("घ़"); addViram();
/* More additional consonants, handled like the basic consonants */
z encode("ज़"); addViram();
R encode("ड़"); addViram();
RH encode("ढ़"); addViram();
f encode("फ़"); addViram();
/* 2399 is not used */
/* Remaining vowels: independent forms carry the leading ' (Hamza), dependent
   forms do not. convh2s() drops a .h that directly follows */
'\.R encode("ॠ"); convh2s();
'\.L encode("ॡ"); convh2s();
\.l encode("ॢ");
\.L encode("ॣ"); convh2s();
/* Single and double danda */
\| encode("।");
\|\| encode("॥");
/* Digits. Like every encode() rule they are only converted while
   transliteration is on and are echoed unchanged otherwise */
0 encode("०");
1 encode("१");
2 encode("२");
3 encode("३");
4 encode("४");
5 encode("५");
6 encode("६");
7 encode("७");
8 encode("८");
9 encode("९");
/* A '.' that is not part of any longer token above */
\. encode("॰");
/******************************************************************************
* END RULES
******************************************************************************/
%%
/******************************************************************************
* BEGIN USER SUBROUTINES
******************************************************************************/
/*
** yywrap is consulted by the scanner when it runs out of input. It always
** returns 1, i.e. there is no further input to switch to
*/
int yywrap(void)
{
    const int noMoreInput = 1;

    return noMoreInput;
}
/*
** Report whether the character c can be the beginning of a dependent vowel.
**
** Returns 1 for a, A, i, I, u, U, e, E, o and O. For a '.' one more character
** is read from the input and pushed straight back; the result is 1 if that
** character is r, R, l or L. In every other case, including end of input
** right after the '.', the result is 0.
*/
int isVowel(int c) {
    int next;

    if (c == '.') {
        /* We need further look ahead */
        next = input();
        if (next < 0) {
            return 0;
        }
        unput(next); /* Undo the lookahead */
        return next == 'r' || next == 'R' || next == 'l' || next == 'L';
    }

    return c == 'a' || c == 'A' || c == 'i' || c == 'I' || c == 'u' ||
           c == 'U' || c == 'e' || c == 'E' || c == 'o' || c == 'O';
}
/*
** Report whether the character c can be the beginning of a consonant.
**
** Returns 1 if c is one of the single characters listed in the table below
** ('H' is included as it is an implicit consonant). For a '.' one more
** character is read from the input and pushed straight back; the result is
** 1 if that character is N or a. Everything else gives 0.
*/
int isConsonant(int c) {
    static const char starters[] = "`w'kgCjTDNtdnpbmyrlLvSXshHqKGzZRf";
    int i;
    int next;

    for (i = 0; starters[i] != '\0'; i++) {
        if (c == starters[i]) {
            return 1;
        }
    }

    if (c != '.') {
        return 0;
    }

    /* We need further look ahead */
    next = input();
    if (next < 0) {
        return 0;
    }
    unput(next); /* Undo the lookahead */
    return next == 'N' || next == 'a';
}
/*
** Put a Hamza (') in front of the upcoming input if that input starts with a
** vowel. Nothing is done unless the transliterate flag is set, and nothing
** is done at end of input. The character that was looked at is always
** pushed back.
*/
void addHamza() {
    int next;
    int vowelFollows;

    if (!transliterate) {
        return;
    }

    next = input(); /* Attempt lookahead */
    if (next < 0) {
        return;
    }

    /* isVowel() may look one character further ahead itself, so it has to
       run before next is pushed back */
    vowelFollows = isVowel(next);
    unput(next); /* Undo the lookahead */
    if (vowelFollows) {
        unput('\'');
    }
}
/*
** Print a virAm after a consonant unless a vowel follows.
**
** Nothing happens when the transliterate flag is clear or the noViram flag
** is set. Otherwise the next character is examined:
**   - end of input: the virAm is printed;
**   - start of a vowel: nothing is printed;
**   - ".d": nothing is printed, the decision is delayed to beyond the ".d";
**   - 'H': the virAm is printed and the now solitary 'H' is replaced by 'h';
**   - anything else: the virAm is printed, followed by the ZWNJ if the
**     useNonJoiner flag is set.
** Apart from the 'H' replacement the input is left as it was found.
*/
void addViram() {
    int next;

    if (!transliterate || noViram) {
        return;
    }

    next = input(); /* Attempt lookahead */
    if (next < 0) {
        printf("%s",VIRAM); /* What else can we do? */
        return;
    }

    /* isVowel() does lookahead of its own, so next is only pushed back
       after it has been consulted */
    if (isVowel(next)) {
        unput(next);
        return;
    }

    if (next == '.') {
        int after = input();
        if (after >= 0) {
            unput(after);
            if (after == 'd') {
                /* Leave the ".d" alone; the rule for it calls us again */
                unput(next);
                return;
            }
        }
    }

    printf("%s",VIRAM);
    if (next == 'H') {
        /* Replace this now solitary 'H' by 'h' */
        unput('h');
    } else {
        /* Put the ZWNJ only if the flag is set */
        if (useNonJoiner) {
            printf("%s",ZWNJ);
        }
        unput(next); /* Undo the lookahead */
    }
}
/*
** Push a .s back on the input (presumably to follow a long vowel) to indicate
** a vowel grade reduction. If the upcoming input starts with .n, the .s is
** placed after the .n instead of in front of it. All other input is left
** untouched behind the .s
*/
void reduceVowel() {
    int first = input(); /* Look ahead for a .n */

    if (first == '.') {
        int second = input();

        if (second == 'n') {
            /* Input becomes ".n.s"; characters go back last one first */
            unput('s');
            unput('.');
            unput(second);
            unput(first);
            return;
        }
        if (second >= 0) {
            unput(second);
        }
    }

    if (first >= 0) {
        unput(first);
    }
    unput('s');
    unput('.');
}
/*
** This function corrects the .N to one of .N^k, .N^j, N, n, and m depending
** on the following consonant
*/
void correctAnusvar() {
if (!transliterate) return;
int c = input(); // Look ahead for the following consonant;
unput(c); // Restore lookahead
switch (c) {
case 'k':
case 'g':
/* Replace by .N^k */
unput('k');
unput('^');
unput('N');
unput('.');
break;
case 'C':
case 'j':
/* Replace by .N^j */
unput('j');
unput('^');
unput('N');
unput('.');
break;
case 't':
case 'd':
case 'n':
/* Replace by n */
unput('n');
break;
case 'T':
case 'D':
case 'N':
/* Replace by N */
unput('N');
break;
case 'p':
case 'b':
case 'm':
/* Replace by m */
unput('m');
break;
}
}
/*
** Decide whether the 'a' that was just scanned is kept in the output.
**
** The 'a' is echoed if the upcoming input starts with a vowel or with '.d'.
** In every other case (a consonant, '.s', '.h', other delimiters, end of
** input) it is dropped. The input that was looked at is pushed back
** unchanged. Retaining the 'a' is only a way of flagging potential bugs in
** the input text, and the exact set of cases is admittedly arbitrary.
*/
void adjustDanda() {
    int next = input();
    int keep = 0;

    if (next < 0) {
        return;
    }

    if (isVowel(next)) {
        keep = 1;
    } else if (next == '.') {
        int after = input(); /* Lookahead for a d */

        if (after >= 0) {
            unput(after);
            keep = (after == 'd');
        }
    }

    if (keep) {
        ECHO; /* Retain the 'a' in the output */
    }
    unput(next);
}
/*
** Print utfValue to stdout if the transliterate flag is set. Otherwise send
** the scanned token to the output as is.
**
** An if/else is used rather than the conditional operator because ECHO is a
** macro supplied by the scanner generator and is not guaranteed to be an
** expression (current flex defines it as a do { } while (0) statement).
*/
void encode(char* utfValue) {
    if (transliterate) {
        printf("%s", utfValue);
    } else {
        ECHO;
    }
}
/*
** convh2s was deliberately PUT here for backward compatibility purposes (so
** that my UCS scheme is compatible with my Jtrans scheme). The ORIGINAL
** intention was to replace any .h following a vowel by a .s
**
** Currently it just ignores the .h: if the upcoming input starts with .h,
** those two characters are consumed and nothing is put in their place. Any
** other input is pushed back unchanged. Nothing is done unless the
** transliterate flag is set.
*/
void convh2s() {
    int first;
    int second;

    if (!transliterate) {
        return;
    }

    first = input();
    if (first < 0) {
        return;
    }

    if (first == '.') {
        /* Look for a further 'h' */
        second = input();
        if (second == 'h') {
            return; /* Swallow the ".h" */
        }
        if (second >= 0) {
            unput(second);
        }
    }
    unput(first);
}
/******************************************************************************
* END USER SUBROUTINES
******************************************************************************/
/* Source: geocities.com/roshbaby/UCS (geocities.com/roshbaby) */