Package pyarabic :: Module arabicchars'
[hide private]
[frames | no frames]

Source Code for Module pyarabic.arabicchars'

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  #--- 
  4  # $Id: arabic.py,v 1.6 2003/04/22 17:18:22 elzubeir Exp $ 
  5  # 
  6  # ------------ 
  7  # Description: 
  8  # ------------ 
  9  # 
 10  # Arabic codes 
 11  # 
 12  # (C) Copyright 2003, Arabeyes, Mohammed Elzubeir 
 13  # ----------------- 
 14  # Revision Details:    (Updated by Revision Control System) 
 15  # ----------------- 
 16  #  $Date: 2003/04/22 17:18:22 $ 
 17  #  $Author: elzubeir $ 
 18  #  $Revision: 1.6 $ 
 19  #  $Source: /home/arabeyes/cvs/projects/duali/pyduali/pyduali/arabic.py,v $ 
 20  # 
 21  #  This program is written under the BSD License. 
 22  #--- 
 23  """ 
 24  Arabic module 
 25  """ 
 26  import re 
class arabicchars:
    """
    Arabic character constants plus helpers to test and strip them.

    The class is a plain namespace (it does not subclass unicode): every
    code point is a class attribute holding a unicode string, the groups
    are tuples of those strings, and the C{*_pat} attributes are compiled
    regular expressions matching a single character of the group anywhere
    in a string.
    """

    # Punctuation, letters and digits
    COMMA = u'\u060C'
    SEMICOLON = u'\u061B'
    QUESTION = u'\u061F'
    HAMZA = u'\u0621'
    ALEF_MADDA = u'\u0622'
    ALEF_HAMZA_ABOVE = u'\u0623'
    WAW_HAMZA = u'\u0624'
    ALEF_HAMZA_BELOW = u'\u0625'
    YEH_HAMZA = u'\u0626'
    ALEF = u'\u0627'
    BEH = u'\u0628'
    TEH_MARBUTA = u'\u0629'
    TEH = u'\u062a'
    THEH = u'\u062b'
    JEEM = u'\u062c'
    HAH = u'\u062d'
    KHAH = u'\u062e'
    DAL = u'\u062f'
    THAL = u'\u0630'
    REH = u'\u0631'
    ZAIN = u'\u0632'
    SEEN = u'\u0633'
    SHEEN = u'\u0634'
    SAD = u'\u0635'
    DAD = u'\u0636'
    TAH = u'\u0637'
    ZAH = u'\u0638'
    AIN = u'\u0639'
    GHAIN = u'\u063a'
    TATWEEL = u'\u0640'
    FEH = u'\u0641'
    QAF = u'\u0642'
    KAF = u'\u0643'
    LAM = u'\u0644'
    MEEM = u'\u0645'
    NOON = u'\u0646'
    HEH = u'\u0647'
    WAW = u'\u0648'
    ALEF_MAKSURA = u'\u0649'
    YEH = u'\u064a'
    MADDA_ABOVE = u'\u0653'
    HAMZA_ABOVE = u'\u0654'
    HAMZA_BELOW = u'\u0655'
    ZERO = u'\u0660'
    ONE = u'\u0661'
    TWO = u'\u0662'
    THREE = u'\u0663'
    FOUR = u'\u0664'
    FIVE = u'\u0665'
    SIX = u'\u0666'
    SEVEN = u'\u0667'
    EIGHT = u'\u0668'
    NINE = u'\u0669'
    PERCENT = u'\u066a'
    DECIMAL = u'\u066b'
    THOUSANDS = u'\u066c'
    STAR = u'\u066d'
    MINI_ALEF = u'\u0670'
    ALEF_WASLA = u'\u0671'
    FULL_STOP = u'\u06d4'
    BYTE_ORDER_MARK = u'\ufeff'

    # Diacritics
    FATHATAN = u'\u064b'
    DAMMATAN = u'\u064c'
    KASRATAN = u'\u064d'
    FATHA = u'\u064e'
    DAMMA = u'\u064f'
    KASRA = u'\u0650'
    SHADDA = u'\u0651'
    SUKUN = u'\u0652'

    # Small letters (SMALL_ALEF is the same code point as MINI_ALEF)
    SMALL_ALEF = u"\u0670"
    SMALL_WAW = u"\u06E5"
    SMALL_YEH = u"\u06E6"

    # Ligatures (presentation forms) and their two-letter decompositions
    LAM_ALEF = u'\ufefb'
    LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
    LAM_ALEF_HAMZA_BELOW = u'\ufef9'
    LAM_ALEF_MADDA_ABOVE = u'\ufef5'
    simple_LAM_ALEF = u'\u0644\u0627'
    simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
    simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
    simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'

    # Groups
    TASHKEEL = (
        FATHATAN, DAMMATAN, KASRATAN,
        FATHA, DAMMA, KASRA,
        SUKUN,
        SHADDA,
    )
    # HARAKAT must stay a tuple: isHaraka() does a membership test on it.
    HARAKAT = (
        FATHATAN, DAMMATAN, KASRATAN,
        FATHA, DAMMA, KASRA,
        SUKUN,
    )
    SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)
    TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
    LIGUATURES = (
        LAM_ALEF,
        LAM_ALEF_HAMZA_ABOVE,
        LAM_ALEF_HAMZA_BELOW,
        LAM_ALEF_MADDA_ABOVE,
    )
    # Correctly spelled alias; the historical name above is kept for callers.
    LIGATURES = LIGUATURES
    HAMZAT = (
        HAMZA,
        WAW_HAMZA,
        YEH_HAMZA,
        HAMZA_ABOVE,
        HAMZA_BELOW,
        ALEF_HAMZA_BELOW,
        ALEF_HAMZA_ABOVE,
    )
    ALEFAT = (
        ALEF,
        ALEF_MADDA,
        ALEF_HAMZA_ABOVE,
        ALEF_HAMZA_BELOW,
        ALEF_WASLA,
        ALEF_MAKSURA,
        SMALL_ALEF,
    )
    WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
    YEHLIKE = (
        YEH,
        YEH_HAMZA,
        ALEF_MAKSURA,
        SMALL_YEH,
    )
    WAWLIKE = (
        WAW,
        WAW_HAMZA,
        SMALL_WAW,
    )
    TEHLIKE = (
        TEH,
        TEH_MARBUTA,
    )
    SMALL = (
        SMALL_ALEF,
        SMALL_WAW,
        SMALL_YEH,
    )
    # NOTE(review): MOON and SUN hold the same two letters as TEHLIKE, which
    # looks like an unfinished placeholder -- confirm the intended letter
    # lists before relying on them.  Values are left as they were.
    MOON = (
        TEH,
        TEH_MARBUTA,
    )
    SUN = (
        TEH,
        TEH_MARBUTA,
    )

    # Regular expressions.  They are deliberately NOT anchored with ^...$ so
    # that search() and sub() find marks anywhere inside a word.
    HARAKAT_pat = re.compile(u"[" + u"".join(HARAKAT) + u"]")
    TASHKEEL_pat = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
    HARAKAT_NO_SHADDA_pat = HARAKAT_pat

    # Matches one character outside the accepted Arabic range.
    # NOTE(review): LAM_ALEF_HAMZA_BELOW is not in the accepted set, exactly
    # as in the expression this was taken from -- confirm that is intended.
    _NOT_ARABIC_pat = re.compile(
        u"[^\u0621-\u0652%s%s%s]"
        % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE))

    def __init__(self):
        """Nothing to initialise: all data lives on the class."""
        pass

    def isArabicstring(self, text):
        """Checks that a text only holds Arabic Unicode block characters.

        Accepted characters are U+0621..U+0652 plus the LAM_ALEF,
        LAM_ALEF_HAMZA_ABOVE and LAM_ALEF_MADDA_ABOVE ligatures; spaces,
        digits and punctuation are therefore rejected.
        @param text: input text
        @type text: unicode
        @return: True if the text is not empty and all characters are in
            the Arabic block
        @rtype: Boolean
        """
        if not text:
            return False
        return self._NOT_ARABIC_pat.search(text) is None

    def isArabicword(self, word):
        """Checks for a valid Arabic word.

        A word is rejected when it is empty, holds a character outside the
        Arabic block, starts with WAW_HAMZA, YEH_HAMZA, FATHA, DAMMA, SUKUN
        or KASRA, has an ALEF_MAKSURA before its last character, or has a
        TEH_MARBUTA followed by at least two characters of which the first
        is not FATHA, DAMMA or KASRA.
        @param word: input word
        @type word: unicode
        @return: True if the word passes all the checks above
        @rtype: Boolean
        """
        if not self.isArabicstring(word):
            return False
        if word[0] in (self.WAW_HAMZA, self.YEH_HAMZA, self.FATHA,
                       self.DAMMA, self.SUKUN, self.KASRA):
            return False
        # Alef maksura is only accepted as the very last character.
        if self.ALEF_MAKSURA in word[:-1]:
            return False
        # Teh marbuta inside the word must be followed by a short vowel.
        if re.search(u"%s[^%s%s%s]." % (self.TEH_MARBUTA, self.DAMMA,
                                        self.KASRA, self.FATHA), word):
            return False
        return True

    ################################################
    #{ is letter functions
    ################################################
    def isSukun(self, archar):
        """Checks for Arabic Sukun Mark.
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar == self.SUKUN

    def isShadda(self, archar):
        """Checks for Arabic Shadda Mark.
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar == self.SHADDA

    def isTatweel(self, archar):
        """Checks for Arabic Tatweel letter modifier.
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar == self.TATWEEL

    def isTanwin(self, archar):
        """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.TANWIN

    def isTashkeel(self, archar):
        """Checks for Arabic Tashkeel Marks (FATHA, DAMMA, KASRA, SUKUN,
        SHADDA, FATHATAN, DAMMATAN, KASRATAN).
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.TASHKEEL

    def isHaraka(self, archar):
        """Checks for Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN,
        TANWIN).
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.HARAKAT

    def isShortharaka(self, archar):
        """Checks for Arabic short Harakat Marks (FATHA, DAMMA, KASRA,
        SUKUN).
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.SHORTHARAKAT

    def isLigature(self, archar):
        """Checks for Arabic Ligatures like LamAlef.
        (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
        LAM_ALEF_MADDA_ABOVE)
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.LIGUATURES

    def isHamza(self, archar):
        """Checks for Arabic Hamza forms.
        HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
        ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE)
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.HAMZAT

    def isAlef(self, archar):
        """Checks for Arabic Alef forms.
        ALEFAT are (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
        ALEF_WASLA, ALEF_MAKSURA, SMALL_ALEF)
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.ALEFAT

    def isYehlike(self, archar):
        """Checks for Arabic Yeh forms.
        Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.YEHLIKE

    def isWawlike(self, archar):
        """Checks for Arabic Waw like forms.
        Waw forms : WAW, WAW_HAMZA, SMALL_WAW
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.WAWLIKE

    def isTeh(self, archar):
        """Checks for Arabic Teh forms.
        Teh forms : TEH, TEH_MARBUTA
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.TEHLIKE

    def isSmall(self, archar):
        """Checks for Arabic Small letters.
        SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.SMALL

    def isWeak(self, archar):
        """Checks for Arabic Weak letters.
        Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
        @param archar: arabic unicode char
        @type archar: unicode
        @rtype: Boolean
        """
        return archar in self.WEAK

    #####################################
    #{ Has letter functions
    #####################################
    def hasShadda(self, word):
        """Checks if the arabic word contains shadda.
        @param word: arabic unicode word
        @type word: unicode
        @rtype: Boolean
        """
        return self.SHADDA in word

    # Historical (misspelled) name, kept so existing callers keep working.
    hassHadda = hasShadda

    def hasHaraka(self, word):
        """Checks if the arabic word contains harakat (FATHA, DAMMA, KASRA,
        SUKUN or a tanwin mark).
        @param word: arabic unicode word
        @type word: unicode
        @rtype: Boolean
        """
        return self.HARAKAT_pat.search(word) is not None

    #####################################
    #{ word and text functions
    #####################################
    def isVocalized(self, word):
        """Checks if the arabic word is vocalized.
        The word mustn't have any spaces or punctuation.
        @param word: arabic unicode word
        @type word: unicode
        @return: True if the word holds at least one haraka
        @rtype: Boolean
        """
        # A purely alphabetic word cannot hold a haraka (marks are not
        # alphabetic), so a single search gives the same answer as the
        # former isalpha() shortcut followed by the search.
        return self.HARAKAT_pat.search(word) is not None

    def isVocalizedtext(self, text):
        """Checks if the arabic text is vocalized.
        The text can contain many words and spaces.
        @param text: arabic unicode text
        @type text: unicode
        @return: True if the text holds at least one haraka
        @rtype: Boolean
        """
        return self.HARAKAT_pat.search(text) is not None

    #####################################
    #{ Strip functions
    #####################################
    def stripHarakat(self, word):
        """Strip Harakat from arabic word except Shadda.
        Harakat doesn't contain Shadda;
        to strip all Harakat and Shadda, use stripTashkeel function.
        @param word: arabic unicode word
        @type word: unicode
        @return: the word without harakat
        @rtype: unicode
        """
        return self.HARAKAT_pat.sub(u'', word)

    def stripTashkeel(self, word):
        """Strip Tashkeel from arabic word.
        Tashkeel contains Harakat and Shadda;
        to keep the Shadda, use stripHarakat function.
        @param word: arabic unicode word
        @type word: unicode
        @return: the word without harakat and shadda
        @rtype: unicode
        """
        return self.TASHKEEL_pat.sub(u'', word)

    def stripTatweel(self, word):
        """Strip Tatweel (Kashida) from arabic word.
        @param word: arabic unicode word
        @type word: unicode
        @return: the word without tatweel
        @rtype: unicode
        """
        return word.replace(self.TATWEEL, u'')
460