Package pyarabic :: Module arabicchars
[hide private]
[frames | no frames]

Source Code for Module pyarabic.arabicchars

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  #--- 
  4  # 
  5  # ------------ 
  6  # Description: 
  7  # ------------ 
  8  # 
  9  # Arabic codes 
 10  # 
 11  # (C) Copyright 2010, Taha Zerrouki 
 12  # ----------------- 
 13  #  $Date: 2010/03/01 
 14  #  $Author: Taha Zerrouki$ 
 15  #  $Revision: 0.1 $ 
 16  #  This program is written under the Gnu Public License. 
 17  # 
 18  """ 
 19  Arabic module 
 20  """ 
 21  import re 
class arabicchars:
    """Arabic character constants and helpers for classifying and cleaning text.

    The class exposes:
      - named constants for Arabic letters, digits, punctuation, diacritics
        and Lam-Alef ligatures,
      - groups of related characters (TASHKEEL, HARAKAT, HAMZAT, ...),
      - compiled regular expressions built from those groups,
      - predicate methods (isXxx / hasXxx) and strip/normalize methods.

    All strings are unicode strings; the code runs on Python 2 and Python 3.
    """

    # Punctuation and letters
    COMMA = u'\u060C'
    SEMICOLON = u'\u061B'
    QUESTION = u'\u061F'
    HAMZA = u'\u0621'
    ALEF_MADDA = u'\u0622'
    ALEF_HAMZA_ABOVE = u'\u0623'
    WAW_HAMZA = u'\u0624'
    ALEF_HAMZA_BELOW = u'\u0625'
    YEH_HAMZA = u'\u0626'
    ALEF = u'\u0627'
    BEH = u'\u0628'
    TEH_MARBUTA = u'\u0629'
    TEH = u'\u062a'
    THEH = u'\u062b'
    JEEM = u'\u062c'
    HAH = u'\u062d'
    KHAH = u'\u062e'
    DAL = u'\u062f'
    THAL = u'\u0630'
    REH = u'\u0631'
    ZAIN = u'\u0632'
    SEEN = u'\u0633'
    SHEEN = u'\u0634'
    SAD = u'\u0635'
    DAD = u'\u0636'
    TAH = u'\u0637'
    ZAH = u'\u0638'
    AIN = u'\u0639'
    GHAIN = u'\u063a'
    TATWEEL = u'\u0640'
    FEH = u'\u0641'
    QAF = u'\u0642'
    KAF = u'\u0643'
    LAM = u'\u0644'
    MEEM = u'\u0645'
    NOON = u'\u0646'
    HEH = u'\u0647'
    WAW = u'\u0648'
    ALEF_MAKSURA = u'\u0649'
    YEH = u'\u064a'
    MADDA_ABOVE = u'\u0653'
    HAMZA_ABOVE = u'\u0654'
    HAMZA_BELOW = u'\u0655'
    ZERO = u'\u0660'
    ONE = u'\u0661'
    TWO = u'\u0662'
    THREE = u'\u0663'
    FOUR = u'\u0664'
    FIVE = u'\u0665'
    SIX = u'\u0666'
    SEVEN = u'\u0667'
    EIGHT = u'\u0668'
    NINE = u'\u0669'
    PERCENT = u'\u066a'
    DECIMAL = u'\u066b'
    THOUSANDS = u'\u066c'
    STAR = u'\u066d'
    MINI_ALEF = u'\u0670'
    ALEF_WASLA = u'\u0671'
    FULL_STOP = u'\u06d4'
    BYTE_ORDER_MARK = u'\ufeff'

    # Diacritics
    FATHATAN = u'\u064b'
    DAMMATAN = u'\u064c'
    KASRATAN = u'\u064d'
    FATHA = u'\u064e'
    DAMMA = u'\u064f'
    KASRA = u'\u0650'
    SHADDA = u'\u0651'
    SUKUN = u'\u0652'

    # Small Letters
    SMALL_ALEF = u"\u0670"
    SMALL_WAW = u"\u06E5"
    SMALL_YEH = u"\u06E6"

    # Ligatures (single code point presentation forms) ...
    LAM_ALEF = u'\ufefb'
    LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
    LAM_ALEF_HAMZA_BELOW = u'\ufef9'
    LAM_ALEF_MADDA_ABOVE = u'\ufef5'
    # ... and their two-letter equivalents.
    simple_LAM_ALEF = u'\u0644\u0627'
    simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
    simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
    simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'

    # Groups
    LETTERS = u''.join([
        ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
        DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
        AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
        HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW,
        YEH_HAMZA,
    ])

    TASHKEEL = (
        FATHATAN, DAMMATAN, KASRATAN,
        FATHA, DAMMA, KASRA,
        SUKUN,
        SHADDA,
    )
    HARAKAT = (
        FATHATAN, DAMMATAN, KASRATAN,
        FATHA, DAMMA, KASRA,
        SUKUN,
    )
    SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)

    TANWIN = (FATHATAN, DAMMATAN, KASRATAN)

    # The historical (misspelled) name is kept because it is part of the
    # public interface; LIGATURES is the correctly spelled alias.
    LIGUATURES = (
        LAM_ALEF,
        LAM_ALEF_HAMZA_ABOVE,
        LAM_ALEF_HAMZA_BELOW,
        LAM_ALEF_MADDA_ABOVE,
    )
    LIGATURES = LIGUATURES

    HAMZAT = (
        HAMZA,
        WAW_HAMZA,
        YEH_HAMZA,
        HAMZA_ABOVE,
        HAMZA_BELOW,
        ALEF_HAMZA_BELOW,
        ALEF_HAMZA_ABOVE,
    )
    ALEFAT = (
        ALEF,
        ALEF_MADDA,
        ALEF_HAMZA_ABOVE,
        ALEF_HAMZA_BELOW,
        ALEF_WASLA,
        ALEF_MAKSURA,
        SMALL_ALEF,
    )
    WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
    YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)

    WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
    TEHLIKE = (TEH, TEH_MARBUTA)

    SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)

    # NOTE: no trailing comma after the closing parenthesis, otherwise MOON
    # becomes a one-element tuple holding this tuple and isMoon never matches.
    MOON = (
        HAMZA,
        ALEF_MADDA,
        ALEF_HAMZA_ABOVE,
        ALEF_HAMZA_BELOW,
        ALEF,
        BEH,
        JEEM,
        HAH,
        KHAH,
        AIN,
        GHAIN,
        FEH,
        QAF,
        KAF,
        MEEM,
        HEH,
        WAW,
        YEH,
    )
    SUN = (
        TEH,
        THEH,
        DAL,
        THAL,
        REH,
        ZAIN,
        SEEN,
        SHEEN,
        SAD,
        DAD,
        TAH,
        ZAH,
        LAM,
        NOON,
    )

    # Position of each letter in the alphabet; every hamza form shares 29 and
    # TEH_MARBUTA shares the position of TEH.
    AlphabeticOrder = {
        ALEF: 1,
        BEH: 2,
        TEH: 3,
        TEH_MARBUTA: 3,
        THEH: 4,
        JEEM: 5,
        HAH: 6,
        KHAH: 7,
        DAL: 8,
        THAL: 9,
        REH: 10,
        ZAIN: 11,
        SEEN: 12,
        SHEEN: 13,
        SAD: 14,
        DAD: 15,
        TAH: 16,
        ZAH: 17,
        AIN: 18,
        GHAIN: 19,
        FEH: 20,
        QAF: 21,
        KAF: 22,
        LAM: 23,
        MEEM: 24,
        NOON: 25,
        HEH: 26,
        WAW: 27,
        YEH: 28,
        HAMZA: 29,

        ALEF_MADDA: 29,
        ALEF_HAMZA_ABOVE: 29,
        WAW_HAMZA: 29,
        ALEF_HAMZA_BELOW: 29,
        YEH_HAMZA: 29,
    }

    # Arabic name of each letter and diacritic.
    NAMES = {
        ALEF: u"ألف",
        BEH: u"باء",
        TEH: u'تاء',
        TEH_MARBUTA: u'تاء مربوطة',
        THEH: u'ثاء',
        JEEM: u'جيم',
        HAH: u'حاء',
        KHAH: u'خاء',
        DAL: u'دال',
        THAL: u'ذال',
        REH: u'راء',
        ZAIN: u'زاي',
        SEEN: u'سين',
        SHEEN: u'شين',
        SAD: u'صاد',
        DAD: u'ضاد',
        TAH: u'طاء',
        ZAH: u'ظاء',
        AIN: u'عين',
        GHAIN: u'غين',
        FEH: u'فاء',
        QAF: u'قاف',
        KAF: u'كاف',
        LAM: u'لام',
        MEEM: u'ميم',
        NOON: u'نون',
        HEH: u'هاء',
        WAW: u'واو',
        YEH: u'ياء',
        HAMZA: u'همزة',

        ALEF_MADDA: u'ألف ممدودة',
        ALEF_HAMZA_ABOVE: u'همزة على الألف',
        WAW_HAMZA: u'همزة على الواو',
        ALEF_HAMZA_BELOW: u'همزة تحت الألف',
        YEH_HAMZA: u'همزة على الياء',
        FATHATAN: u'فتحتان',
        DAMMATAN: u'ضمتان',
        KASRATAN: u'كسرتان',
        FATHA: u'فتحة',
        DAMMA: u'ضمة',
        KASRA: u'كسرة',
        SHADDA: u'شدة',
        SUKUN: u'سكون',
    }

    # Regular expressions.  Plain u"" literals are used instead of ur"" so the
    # module also compiles on Python 3.
    HARAKAT_pattern = re.compile(u"[" + u"".join(HARAKAT) + u"]")
    TASHKEEL_pattern = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
    HAMZAT_pattern = re.compile(u"[" + u"".join(HAMZAT) + u"]")
    ALEFAT_pattern = re.compile(u"[" + u"".join(ALEFAT) + u"]")
    LIGUATURES_pattern = re.compile(u"[" + u"".join(LIGUATURES) + u"]")

    # Private helpers for isArabicstring / isArabicword.
    # Any character outside U+0621..U+0652 that is not a Lam-Alef ligature.
    _NON_ARABIC_pattern = re.compile(
        u"[^\u0621-\u0652" + u"".join(LIGUATURES) + u"]")
    # ALEF_MAKSURA followed by at least one more character.
    _INNER_ALEF_MAKSURA_pattern = re.compile(u"%s." % ALEF_MAKSURA)
    # TEH_MARBUTA, then a character that is not a short vowel, then at least
    # one more character.
    _INNER_TEH_MARBUTA_pattern = re.compile(
        u"%s[^%s%s%s]." % (TEH_MARBUTA, DAMMA, KASRA, FATHA))

    def __init__(self):
        """Create an instance; the class holds no per-instance state."""
        pass

    ################################################
    #{ is letter functions
    ################################################
    def isSukun(self, archar):
        """Checks for Arabic Sukun Mark.
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is SUKUN.
        @rtype: Boolean
        """
        return archar == self.SUKUN

    def isShadda(self, archar):
        """Checks for Arabic Shadda Mark.
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is SHADDA.
        @rtype: Boolean
        """
        return archar == self.SHADDA

    def isTatweel(self, archar):
        """Checks for Arabic Tatweel letter modifier.
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is TATWEEL.
        @rtype: Boolean
        """
        return archar == self.TATWEEL

    def isTanwin(self, archar):
        """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the TANWIN marks.
        @rtype: Boolean
        """
        return archar in self.TANWIN

    def isTashkeel(self, archar):
        """Checks for Arabic Tashkeel Marks (FATHA, DAMMA, KASRA, SUKUN,
        SHADDA, FATHATAN, DAMMATAN, KASRATAN).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the TASHKEEL marks.
        @rtype: Boolean
        """
        return archar in self.TASHKEEL

    def isHaraka(self, archar):
        """Checks for Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the HARAKAT marks.
        @rtype: Boolean
        """
        return archar in self.HARAKAT

    def isShortharaka(self, archar):
        """Checks for Arabic short Harakat Marks (FATHA, DAMMA, KASRA, SUKUN).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the SHORTHARAKAT marks.
        @rtype: Boolean
        """
        return archar in self.SHORTHARAKAT

    def isLigature(self, archar):
        """Checks for Arabic Ligatures like LamAlef.
        (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
        LAM_ALEF_MADDA_ABOVE)
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the Lam-Alef ligatures.
        @rtype: Boolean
        """
        return archar in self.LIGUATURES

    def isHamza(self, archar):
        """Checks for Arabic Hamza forms.
        HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
        ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE)
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the HAMZAT.
        @rtype: Boolean
        """
        return archar in self.HAMZAT

    def isAlef(self, archar):
        """Checks for Arabic Alef forms.
        ALEFAT are (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
        ALEF_WASLA, ALEF_MAKSURA, SMALL_ALEF)
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the ALEFAT.
        @rtype: Boolean
        """
        return archar in self.ALEFAT

    def isYehlike(self, archar):
        """Checks for Arabic Yeh forms.
        Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the YEHLIKE characters.
        @rtype: Boolean
        """
        return archar in self.YEHLIKE

    def isWawlike(self, archar):
        """Checks for Arabic Waw like forms.
        Waw forms : WAW, WAW_HAMZA, SMALL_WAW
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the WAWLIKE characters.
        @rtype: Boolean
        """
        return archar in self.WAWLIKE

    def isTeh(self, archar):
        """Checks for Arabic Teh forms.
        Teh forms : TEH, TEH_MARBUTA
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the TEHLIKE characters.
        @rtype: Boolean
        """
        return archar in self.TEHLIKE

    def isSmall(self, archar):
        """Checks for Arabic Small letters.
        SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the SMALL letters.
        @rtype: Boolean
        """
        return archar in self.SMALL

    def isWeak(self, archar):
        """Checks for Arabic Weak letters.
        Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the WEAK letters.
        @rtype: Boolean
        """
        return archar in self.WEAK

    def isMoon(self, archar):
        """Checks for Arabic Moon letters (members of MOON).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the MOON letters.
        @rtype: Boolean
        """
        return archar in self.MOON

    def isSun(self, archar):
        """Checks for Arabic Sun letters (members of SUN).
        @param archar: arabic unicode char
        @type archar: unicode
        @return: True if archar is one of the SUN letters.
        @rtype: Boolean
        """
        return archar in self.SUN

    #####################################
    #{ general letter functions
    #####################################
    def order(self, archar):
        """return Arabic letter order between 1 and 29.
        Alef order is 1, Yeh is 28, Hamza is 29.
        Teh Marbuta has the same order as Teh, 3.
        @param archar: arabic unicode char
        @type archar: unicode
        @return: arabic order, or 0 when archar is not in AlphabeticOrder.
        @rtype: integer
        """
        return self.AlphabeticOrder.get(archar, 0)

    def name(self, archar):
        """return Arabic letter name in arabic.
        @param archar: arabic unicode char
        @type archar: unicode
        @return: arabic name, or 0 when archar is not in NAMES (the
            historical "unknown" value is kept for backward compatibility).
        @rtype: unicode
        """
        return self.NAMES.get(archar, 0)

    def range(self):
        """return a list of arabic characters.
        Return the characters from U+0600 to U+0652 (SUKUN), both included.
        @return: list of arabic characters.
        @rtype: list of unicode
        """
        # unichr only exists on Python 2; chr is its Python 3 equivalent.
        try:
            to_char = unichr
        except NameError:
            to_char = chr
        # Inside a method the name "range" still resolves to the builtin,
        # because class attributes are not part of the function scope chain.
        return [to_char(code) for code in range(0x0600, 0x0652 + 1)]

    #####################################
    #{ Has letter functions
    #####################################
    def hasShadda(self, word):
        """Checks if the arabic word contains shadda.
        @param word: arabic unicode word
        @type word: unicode
        @return: True if SHADDA occurs in word.
        @rtype: Boolean
        """
        return self.SHADDA in word

    #####################################
    #{ word and text functions
    #####################################
    def isVocalized(self, word):
        """Checks if the arabic word is vocalized.
        The word mustn't have any spaces or punctuation.
        @param word: arabic unicode word
        @type word: unicode
        @return: True if word contains at least one of the HARAKAT marks.
        @rtype: Boolean
        """
        # A purely alphabetic word carries no marks at all: cheap early exit.
        if word.isalpha():
            return False
        return self.HARAKAT_pattern.search(word) is not None

    def isVocalizedtext(self, text):
        """Checks if the arabic text is vocalized.
        The text can contain many words and spaces.
        @param text: arabic unicode text
        @type text: unicode
        @return: True if text contains at least one of the HARAKAT marks.
        @rtype: Boolean
        """
        return self.HARAKAT_pattern.search(text) is not None

    def isArabicstring(self, text):
        """Checks for Arabic Unicode block characters.
        Accepted characters are U+0621..U+0652 and the Lam-Alef ligatures.
        @param text: input text
        @type text: unicode
        @return: True if text is not empty and all its characters are
            accepted, False otherwise.
        @rtype: Boolean
        """
        if len(text) == 0:
            return False
        return self._NON_ARABIC_pattern.search(text) is None

    def isArabicword(self, word):
        """Checks for a valid Arabic word.
        A word is rejected when:
          - it is empty,
          - it contains a character outside U+0621..U+0652 that is not a
            Lam-Alef ligature (this also rejects spaces and digits),
          - it starts with WAW_HAMZA, YEH_HAMZA, FATHA, DAMMA, SUKUN or KASRA,
          - ALEF_MAKSURA is followed by another character,
          - TEH_MARBUTA is followed by a character other than a short vowel
            and then by at least one more character.
        @param word: input word
        @type word: unicode
        @return: True if none of the rejection rules applies.
        @rtype: Boolean
        """
        if len(word) == 0:
            return False
        if self._NON_ARABIC_pattern.search(word):
            return False
        if word[0] in (self.WAW_HAMZA, self.YEH_HAMZA, self.FATHA,
                       self.DAMMA, self.SUKUN, self.KASRA):
            return False
        if self._INNER_ALEF_MAKSURA_pattern.search(word):
            return False
        if self._INNER_TEH_MARBUTA_pattern.search(word):
            return False
        return True

    #####################################
    #{Strip functions
    #####################################
    def stripHarakat(self, text):
        """Strip Harakat from arabic word except Shadda.
        The striped marks are :
            - FATHA, DAMMA, KASRA
            - SUKUN
            - FATHATAN, DAMMATAN, KASRATAN
        Example:
            stripHarakat(u"الْعَرَبِيّةُ") returns u"العربيّة"

        @param text: arabic text.
        @type text: unicode.
        @return: return a striped text.
        @rtype: unicode.
        """
        return self.HARAKAT_pattern.sub(u'', text)

    def stripTashkeel(self, text):
        """Strip vowels from a text, include Shadda.
        The striped marks are :
            - FATHA, DAMMA, KASRA
            - SUKUN
            - SHADDA
            - FATHATAN, DAMMATAN, KASRATAN
        Example:
            stripTashkeel(u"الْعَرَبِيّةُ") returns u"العربية"

        @param text: arabic text.
        @type text: unicode.
        @return: return a striped text.
        @rtype: unicode.
        """
        return self.TASHKEEL_pattern.sub(u'', text)

    def stripTatweel(self, text):
        """Strip tatweel from a text and return a result text.

        Example:
            stripTatweel(u"العـــــربية") returns u"العربية"

        @param text: arabic text.
        @type text: unicode.
        @return: return a striped text.
        @rtype: unicode.
        """
        return text.replace(self.TATWEEL, u'')

    def normalizeLigature(self, text):
        """Normalize Lam Alef ligatures into two letters (LAM and ALEF).
        Some systems present the lamAlef ligature as a single letter; this
        function converts it into two letters.
        The converted ligatures are :
            - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
              LAM_ALEF_MADDA_ABOVE
        Every ligature is replaced by plain LAM + ALEF, so the hamza or madda
        carried by a ligature is not kept.

        @param text: arabic text.
        @type text: unicode.
        @return: return a converted text.
        @rtype: unicode.
        """
        return self.LIGUATURES_pattern.sub(
            u'%s%s' % (self.LAM, self.ALEF), text)

    # #------------------------------------------------
    def vocalizedlike(self, word, vocalized):
        """return True if the given word has the same or a partial
        vocalisation of the pattern vocalized.

        When both inputs carry harakat, word must equal vocalized with any
        subset of its tashkeel marks removed.  Otherwise the two inputs are
        compared after stripping all tashkeel.

        @param word: arabic word, full/partial vocalized.
        @type word: unicode.
        @param vocalized: arabic full vocalized word.
        @type vocalized: unicode.
        @return: True if word is compatible with vocalized.
        @rtype: Boolean
        """
        if not (self.isVocalized(word) and self.isVocalized(vocalized)):
            # Stripping is a no-op on text without marks, so it is safe to
            # strip both sides unconditionally.
            return self.stripTashkeel(word) == self.stripTashkeel(vocalized)
        # Build a pattern where every mark of the reference is optional and
        # every other character must match literally.
        pieces = []
        for char in vocalized:
            if char in self.TASHKEEL:
                pieces.append(char + u"?")
            else:
                pieces.append(re.escape(char))
        # re.match anchors at the start; \Z anchors at the very end.
        pattern = u"".join(pieces) + u"\\Z"
        return re.match(pattern, word) is not None
680