Package pyarabic :: Module araby
[hide private]
[frames] | no frames]

Source Code for Module pyarabic.araby

  1  #!/usr/bin/python 
  2  # -*- coding=utf-8 -*- 
  3  #--- 
  4  # 
  5  # ------------ 
  6  # Description: 
  7  # ------------ 
  8  # 
  9  # Arabic codes 
 10  # 
 11  # (C) Copyright 2010, Taha Zerrouki 
 12  # ----------------- 
 13  #  $Date: 2010/03/01 
 14  #  $Author: Taha Zerrouki$ 
 15  #  $Revision: 0.1 $ 
 16  #  This program is written under the Gnu Public License. 
 17  # 
 18  """ 
 19  Arabic module 
 20  @author: Taha Zerrouki 
 21  @contact: taha dot zerrouki at gmail dot com 
 22  @copyright: Arabtechies, Arabeyes,  Taha Zerrouki 
 23  @license: GPL 
 24  @date:2010/03/01 
 25  @version: 0.1 
 26  """ 
 27  import re 
 28  from stack import * 
 29  #class araby: 
 30  """ 
 31  the arabic chars contains all arabic letters, a sub class of unicode, 
 32  """ 
 33   
 34  COMMA            = u'\u060C' 
 35  SEMICOLON        = u'\u061B' 
 36  QUESTION         = u'\u061F' 
 37  HAMZA            = u'\u0621' 
 38  ALEF_MADDA       = u'\u0622' 
 39  ALEF_HAMZA_ABOVE = u'\u0623' 
 40  WAW_HAMZA        = u'\u0624' 
 41  ALEF_HAMZA_BELOW = u'\u0625' 
 42  YEH_HAMZA        = u'\u0626' 
 43  ALEF             = u'\u0627' 
 44  BEH              = u'\u0628' 
 45  TEH_MARBUTA      = u'\u0629' 
 46  TEH              = u'\u062a' 
 47  THEH             = u'\u062b' 
 48  JEEM             = u'\u062c' 
 49  HAH              = u'\u062d' 
 50  KHAH             = u'\u062e' 
 51  DAL              = u'\u062f' 
 52  THAL             = u'\u0630' 
 53  REH              = u'\u0631' 
 54  ZAIN             = u'\u0632' 
 55  SEEN             = u'\u0633' 
 56  SHEEN            = u'\u0634' 
 57  SAD              = u'\u0635' 
 58  DAD              = u'\u0636' 
 59  TAH              = u'\u0637' 
 60  ZAH              = u'\u0638' 
 61  AIN              = u'\u0639' 
 62  GHAIN            = u'\u063a' 
 63  TATWEEL          = u'\u0640' 
 64  FEH              = u'\u0641' 
 65  QAF              = u'\u0642' 
 66  KAF              = u'\u0643' 
 67  LAM              = u'\u0644' 
 68  MEEM             = u'\u0645' 
 69  NOON             = u'\u0646' 
 70  HEH              = u'\u0647' 
 71  WAW              = u'\u0648' 
 72  ALEF_MAKSURA     = u'\u0649' 
 73  YEH              = u'\u064a' 
 74  MADDA_ABOVE      = u'\u0653' 
 75  HAMZA_ABOVE      = u'\u0654' 
 76  HAMZA_BELOW      = u'\u0655' 
 77  ZERO             = u'\u0660' 
 78  ONE              = u'\u0661' 
 79  TWO              = u'\u0662' 
 80  THREE            = u'\u0663' 
 81  FOUR             = u'\u0664' 
 82  FIVE             = u'\u0665' 
 83  SIX              = u'\u0666' 
 84  SEVEN            = u'\u0667' 
 85  EIGHT            = u'\u0668' 
 86  NINE             = u'\u0669' 
 87  PERCENT          = u'\u066a' 
 88  DECIMAL          = u'\u066b' 
 89  THOUSANDS        = u'\u066c' 
 90  STAR             = u'\u066d' 
 91  MINI_ALEF        = u'\u0670' 
 92  ALEF_WASLA       = u'\u0671' 
 93  FULL_STOP        = u'\u06d4' 
 94  BYTE_ORDER_MARK  = u'\ufeff' 
 95   
 96  # Diacritics 
 97  FATHATAN         = u'\u064b' 
 98  DAMMATAN         = u'\u064c' 
 99  KASRATAN         = u'\u064d' 
100  FATHA            = u'\u064e' 
101  DAMMA            = u'\u064f' 
102  KASRA            = u'\u0650' 
103  SHADDA           = u'\u0651' 
104  SUKUN            = u'\u0652' 
105   
106  # Small Letters 
107  SMALL_ALEF      =u"\u0670" 
108  SMALL_WAW       =u"\u06E5" 
109  SMALL_YEH       =u"\u06E6" 
110  #Ligatures 
111  LAM_ALEF                    =u'\ufefb' 
112  LAM_ALEF_HAMZA_ABOVE        =u'\ufef7' 
113  LAM_ALEF_HAMZA_BELOW        =u'\ufef9' 
114  LAM_ALEF_MADDA_ABOVE        =u'\ufef5' 
115  simple_LAM_ALEF             =u'\u0644\u0627' 
116  simple_LAM_ALEF_HAMZA_ABOVE =u'\u0644\u0623' 
117  simple_LAM_ALEF_HAMZA_BELOW =u'\u0644\u0625' 
118  simple_LAM_ALEF_MADDA_ABOVE =u'\u0644\u0622' 
119  # groups 
# All basic letters, as one unicode string (usable with ``in``).
LETTERS = u''.join((
    ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
    DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
    AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
    HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW,
    YEH_HAMZA,
))

# Diacritic groups. HARAKAT is every mark except SHADDA; TASHKEEL adds it.
TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)
HARAKAT = TANWIN + SHORTHARAKAT
TASHKEEL = HARAKAT + (SHADDA,)

# Single-code-point Lam-Alef ligatures.
LIGUATURES = (
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
)

# Every hamza form: standalone, seated on a letter, or combining.
HAMZAT = (
    HAMZA, WAW_HAMZA, YEH_HAMZA,
    HAMZA_ABOVE, HAMZA_BELOW,
    ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE,
)

# Every alef form, including ALEF_MAKSURA and SMALL_ALEF.
ALEFAT = (
    ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
    ALEF_WASLA, ALEF_MAKSURA, SMALL_ALEF,
)

WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)
WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
TEHLIKE = (TEH, TEH_MARBUTA)
SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)

# Moon letters (hamza/alef forms included).
MOON = (
    HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF,
    BEH, JEEM, HAH, KHAH, AIN, GHAIN, FEH, QAF, KAF, MEEM, HEH,
    WAW, YEH,
)

# Sun letters.
SUN = (
    TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN,
    SAD, DAD, TAH, ZAH, LAM, NOON,
)
# Alphabetic rank of each letter (1..29). TEH_MARBUTA shares rank 3 with TEH,
# and every hamza-carrying form shares rank 29 with HAMZA.
AlphabeticOrder = {
    ALEF: 1,
    BEH: 2,
    TEH: 3,
    TEH_MARBUTA: 3,
    THEH: 4,
    JEEM: 5,
    HAH: 6,
    KHAH: 7,
    DAL: 8,
    THAL: 9,
    REH: 10,
    ZAIN: 11,
    SEEN: 12,
    SHEEN: 13,
    SAD: 14,
    DAD: 15,
    TAH: 16,
    ZAH: 17,
    AIN: 18,
    GHAIN: 19,
    FEH: 20,
    QAF: 21,
    KAF: 22,
    LAM: 23,
    MEEM: 24,
    NOON: 25,
    HEH: 26,
    WAW: 27,
    YEH: 28,
    HAMZA: 29,
    ALEF_MADDA: 29,
    ALEF_HAMZA_ABOVE: 29,
    WAW_HAMZA: 29,
    ALEF_HAMZA_BELOW: 29,
    YEH_HAMZA: 29,
}
# Arabic name of each letter and diacritic, keyed by its character.
NAMES = {
    # letters
    ALEF: u"ألف",
    BEH: u"باء",
    TEH: u"تاء",
    TEH_MARBUTA: u"تاء مربوطة",
    THEH: u"ثاء",
    JEEM: u"جيم",
    HAH: u"حاء",
    KHAH: u"خاء",
    DAL: u"دال",
    THAL: u"ذال",
    REH: u"راء",
    ZAIN: u"زاي",
    SEEN: u"سين",
    SHEEN: u"شين",
    SAD: u"صاد",
    DAD: u"ضاد",
    TAH: u"طاء",
    ZAH: u"ظاء",
    AIN: u"عين",
    GHAIN: u"غين",
    FEH: u"فاء",
    QAF: u"قاف",
    KAF: u"كاف",
    LAM: u"لام",
    MEEM: u"ميم",
    NOON: u"نون",
    HEH: u"هاء",
    WAW: u"واو",
    YEH: u"ياء",
    HAMZA: u"همزة",
    # tatweel and special alef / hamza forms
    TATWEEL: u"تطويل",
    ALEF_MADDA: u"ألف ممدودة",
    ALEF_MAKSURA: u"ألف مقصورة",
    ALEF_HAMZA_ABOVE: u"همزة على الألف",
    WAW_HAMZA: u"همزة على الواو",
    ALEF_HAMZA_BELOW: u"همزة تحت الألف",
    YEH_HAMZA: u"همزة على الياء",
    # diacritics
    FATHATAN: u"فتحتان",
    DAMMATAN: u"ضمتان",
    KASRATAN: u"كسرتان",
    FATHA: u"فتحة",
    DAMMA: u"ضمة",
    KASRA: u"كسرة",
    SHADDA: u"شدة",
    SUKUN: u"سكون",
}
293   
# Regular expressions: one character class per group, compiled once at import.
# Plain u"" literals are used instead of ur"" because the brackets contain no
# backslashes and the ur prefix is a syntax error on Python 3.
HARAKAT_pattern = re.compile(u"[" + u"".join(HARAKAT) + u"]")
TASHKEEL_pattern = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
HAMZAT_pattern = re.compile(u"[" + u"".join(HAMZAT) + u"]")
ALEFAT_pattern = re.compile(u"[" + u"".join(ALEFAT) + u"]")
LIGUATURES_pattern = re.compile(u"[" + u"".join(LIGUATURES) + u"]")
300   
301  ################################################ 
302  #{ is letter functions 
303  ################################################ 
def isSukun(archar):
    """Tell whether a character is the Arabic Sukun mark.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SUKUN, False otherwise.
    @rtype: Boolean
    """
    return archar == SUKUN
312
def isShadda(archar):
    """Tell whether a character is the Arabic Shadda mark.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals SHADDA, False otherwise.
    @rtype: Boolean
    """
    return archar == SHADDA
321
def isTatweel(archar):
    """Tell whether a character is the Arabic Tatweel letter modifier.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar equals TATWEEL, False otherwise.
    @rtype: Boolean
    """
    return archar == TATWEEL
def isTanwin(archar):
    """Tell whether a character is a Tanwin mark.

    The Tanwin marks are FATHATAN, DAMMATAN and KASRATAN.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TANWIN.
    @rtype: Boolean
    """
    return archar in TANWIN
338
def isTashkeel(archar):
    """Tell whether a character is a Tashkeel mark.

    The Tashkeel marks are FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
    KASRA, SUKUN and SHADDA.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TASHKEEL.
    @rtype: Boolean
    """
    return archar in TASHKEEL
347
def isHaraka(archar):
    """Tell whether a character is a Haraka mark.

    The Harakat are FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA and
    SUKUN; SHADDA is not one of them.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HARAKAT.
    @rtype: Boolean
    """
    return archar in HARAKAT
356
def isShortharaka(archar):
    """Tell whether a character is a short Haraka mark.

    The short Harakat are FATHA, DAMMA, KASRA and SUKUN.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SHORTHARAKAT.
    @rtype: Boolean
    """
    return archar in SHORTHARAKAT
365
def isLigature(archar):
    """Tell whether a character is a Lam-Alef ligature.

    The ligatures are LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW
    and LAM_ALEF_MADDA_ABOVE.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of LIGUATURES.
    @rtype: Boolean
    """
    return archar in LIGUATURES
375
def isHamza(archar):
    """Tell whether a character is one of the Hamza forms.

    The forms are HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
    ALEF_HAMZA_BELOW and ALEF_HAMZA_ABOVE.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of HAMZAT.
    @rtype: Boolean
    """
    return archar in HAMZAT
385
def isAlef(archar):
    """Tell whether a character is one of the Alef forms.

    The forms are ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
    ALEF_WASLA, ALEF_MAKSURA and SMALL_ALEF.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of ALEFAT.
    @rtype: Boolean
    """
    return archar in ALEFAT
395
def isYehlike(archar):
    """Tell whether a character is one of the Yeh-like forms.

    The forms are YEH, YEH_HAMZA, ALEF_MAKSURA and SMALL_YEH.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of YEHLIKE.
    @rtype: Boolean
    """
    return archar in YEHLIKE
405
def isWawlike(archar):
    """Tell whether a character is one of the Waw-like forms.

    The forms are WAW, WAW_HAMZA and SMALL_WAW.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WAWLIKE.
    @rtype: Boolean
    """
    return archar in WAWLIKE
415
def isTeh(archar):
    """Tell whether a character is one of the Teh forms.

    The forms are TEH and TEH_MARBUTA.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of TEHLIKE.
    @rtype: Boolean
    """
    return archar in TEHLIKE
def isSmall(archar):
    """Tell whether a character is one of the Arabic small letters.

    The small letters are SMALL_ALEF, SMALL_WAW and SMALL_YEH.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SMALL.
    @rtype: Boolean
    """
    return archar in SMALL
434
def isWeak(archar):
    """Tell whether a character is one of the Arabic weak letters.

    The weak letters are ALEF, WAW, YEH and ALEF_MAKSURA.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of WEAK.
    @rtype: Boolean
    """
    return archar in WEAK
444
def isMoon(archar):
    """Tell whether a character is one of the Arabic Moon letters.

    The Moon letters are the members of the MOON tuple, which also holds
    HAMZA and the hamza/madda forms of ALEF.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of MOON.
    @rtype: Boolean
    """
    return archar in MOON
455
def isSun(archar):
    """Tell whether a character is one of the Arabic Sun letters.

    The Sun letters are the members of the SUN tuple.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: True if archar is a member of SUN.
    @rtype: Boolean
    """
    return archar in SUN
465 ##################################### 466 #{ general letter functions 467 #####################################
def order(archar):
    """Return the alphabetic order of an Arabic letter, between 1 and 29.

    Alef is 1, Yeh is 28 and Hamza is 29. Teh Marbuta shares order 3 with
    Teh, and the hamza-carrying letters share order 29 with Hamza.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: the order of the letter, or 0 if it is not in AlphabeticOrder.
    @rtype: integer
    """
    # dict.get replaces has_key + indexing: a single lookup, and has_key
    # no longer exists on Python 3.
    return AlphabeticOrder.get(archar, 0)
480
def name(archar):
    """Return the Arabic name of a letter or diacritic.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: the name found in NAMES, or an empty unicode string if the
        character has no entry.
    @rtype: unicode
    """
    # dict.get replaces has_key + indexing: a single lookup, and has_key
    # no longer exists on Python 3.
    return NAMES.get(archar, u'')
494
def arabicrange(self):
    """Return the list of characters from U+0600 to U+0652 inclusive.

    @param self: unused; kept only so existing callers keep working.
    @return: list of one-character unicode strings, in code point order.
    @rtype: list of unicode
    """
    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    # Every code point here is below U+10000, so the conversion cannot raise
    # ValueError and no per-character guard is needed.
    return [to_char(code) for code in range(0x0600, 0x0653)]
508 509 510 ##################################### 511 #{ Has letter functions 512 #####################################
def hasShadda(word):
    """Tell whether an Arabic word contains a Shadda.

    @param word: arabic unicode word
    @type word: unicode
    @return: True if SHADDA occurs anywhere in word, False otherwise.
    @rtype: Boolean
    """
    found = re.search(SHADDA, word)
    return found is not None
522 523 ##################################### 524 #{ word and text functions 525 #####################################
def isVocalized(word):
    """Tell whether an Arabic word is vocalized.

    The word must not contain spaces or punctuation. A purely alphabetic
    word (word.isalpha() is true) is reported as not vocalized without
    searching it; otherwise the word is vocalized when it contains at least
    one character of HARAKAT.

    @param word: arabic unicode word
    @type word: unicode
    @return: True if the word carries a haraka, False otherwise.
    @rtype: Boolean
    """
    if word.isalpha():
        return False
    return HARAKAT_pattern.search(word) is not None
def isVocalizedtext(text):
    """Tell whether an Arabic text is vocalized.

    The text may contain several words and spaces; it is vocalized when it
    contains at least one character of HARAKAT.

    @param text: arabic unicode text
    @type text: unicode
    @return: True if the text carries a haraka, False otherwise.
    @rtype: Boolean
    """
    return HARAKAT_pattern.search(text) is not None
def isArabicstring(text):
    """Check that a text only uses standard Arabic block characters.

    Allowed characters are U+0600..U+0652, the four Lam-Alef ligatures of
    LIGUATURES, whitespace (regex class s) and digits (regex class d).
    Extended Arabic characters are not allowed.

    @param text: input text
    @type text: unicode
    @return: True if no character falls outside the allowed set (so an
        empty text is accepted), False otherwise.
    @rtype: Boolean
    """
    # All four ligatures are allowed; the earlier hand-written list left out
    # LAM_ALEF_HAMZA_BELOW although the module treats it as a ligature.
    # NOTE(review): which digits and spaces the s/d classes cover depends on
    # the interpreter's regex Unicode default - confirm whether Arabic-Indic
    # digits are meant to be accepted.
    allowed = u"\u0600-\u0652" + u"".join(LIGUATURES) + u"\\s\\d"
    return re.search(u"[^" + allowed + u"]", text) is None
561
def isArabicrange(text):
    """Check that every character of a text is in an Arabic Unicode range.

    The accepted ranges are U+0600..U+06FF, U+0750..U+077F, U+FB50..U+FDFF
    and U+FE70..U+FEFF.

    @param text: input text
    @type text: unicode
    @return: True if all characters are in those ranges (an empty text is
        accepted), False otherwise.
    @rtype: Boolean
    """
    outsider = re.search(
        u"([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", text)
    return outsider is None
572
def isArabicword(word):
    """Check whether a string looks like a valid Arabic word.

    The checks, applied in order, reject:
      - an empty word;
      - any character outside U+0600..U+0652 and the Lam-Alef ligatures
        (so no spaces, and no Latin or Arabic-Indic digits);
      - a word starting with a haraka, WAW_HAMZA or YEH_HAMZA;
      - an ALEF_MAKSURA followed by any character;
      - a TEH_MARBUTA followed by a character other than FATHA, DAMMA or
        KASRA and then at least one more character.

    @param word: input word
    @type word: unicode
    @return: True if the word passes every check, False otherwise.
    @rtype: Boolean
    """
    if len(word) == 0:
        return False
    # All four ligatures are allowed; the earlier hand-written list left out
    # LAM_ALEF_HAMZA_BELOW although the module treats it as a ligature.
    if re.search(u"[^\u0600-\u0652%s]" % u"".join(LIGUATURES), word):
        return False
    if isHaraka(word[0]) or word[0] in (WAW_HAMZA, YEH_HAMZA):
        return False
    # Alef Maksura is only accepted as the very last character.
    if re.match(u"^(.)*[%s](.)+$" % ALEF_MAKSURA, word):
        return False
    # Teh Marbuta may only be followed by its own marks, not by more letters.
    if re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"
                % (TEH_MARBUTA, DAMMA, KASRA, FATHA), word):
        return False
    return True
594 595 ##################################### 596 #{Strip functions 597 #####################################
def stripHarakat(text):
    """Remove the Harakat from an Arabic text, keeping Shadda.

    The removed marks are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripHarakat(text)
        العربيّة

    @param text: arabic text.
    @type text: unicode.
    @return: the text without its harakat.
    @rtype: unicode.
    """
    return HARAKAT_pattern.sub(u'', text)
615
def stripTashkeel(text):
    """Remove every Tashkeel mark from an Arabic text, Shadda included.

    The removed marks are:
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN
    Example:
        >>> text=u"الْعَرَبِيّةُ"
        >>> stripTashkeel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text without its tashkeel.
    @rtype: unicode.
    """
    return TASHKEEL_pattern.sub('', text)
634
def stripTatweel(text):
    """Remove every Tatweel (elongation) character from a text.

    Example:
        >>> text=u"العـــــربية"
        >>> stripTatweel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: the text without tatweel characters.
    @rtype: unicode.
    """
    stripped = re.sub(TATWEEL, '', text)
    return stripped
650
def normalizeLigature(text):
    """Replace Lam-Alef ligatures by the two letters LAM and ALEF.

    Some systems store Lam-Alef as one ligature character; this function
    expands it into two letters. The ligatures handled are LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW and LAM_ALEF_MADDA_ABOVE.
    All four are replaced by plain LAM + ALEF, so the hamza or madda of
    the ligature is not kept.

    @param text: arabic text.
    @type text: unicode.
    @return: the text with every ligature expanded.
    @rtype: unicode.
    """
    expanded = u'%s%s' % (LAM, ALEF)
    return LIGUATURES_pattern.sub(expanded, text)
668 669
def separate(word):
    """Split an Arabic word into its letters and its marks.

    The result holds one mark per letter: a letter with no haraka gets
    NOT_DEF_HARAKA. Harakat at the start of the word are skipped. Shadda
    is emitted as a letter of its own: the letter before it gets SUKUN as
    its mark, and the shadda takes the haraka that follows it (or
    NOT_DEF_HARAKA).

    NOTE(review): NOT_DEF_HARAKA is not defined in this module; presumably
    it comes from ``from stack import *`` - verify. The loops also assume
    Stack.pop() returns None on an empty stack - confirm.

    @param word: arabic word.
    @type word: unicode.
    @return: (letters, marks).
    @rtype: tuple of two strings
    """
    # Reverse the items so that pop() walks the word from its first char.
    pending = Stack(word)
    pending.items.reverse()
    letters = Stack()
    marks = Stack()
    vowels = HARAKAT

    current = pending.pop()
    # An Arabic word cannot start with a haraka: drop any leading ones.
    while current in vowels:
        current = pending.pop()

    while current is not None:
        if current in vowels:
            # A letter carries a single haraka: it replaces the placeholder
            # (or the previous haraka) recorded for the last letter.
            marks.pop()
            marks.push(current)
        elif current == SHADDA:
            # The letter before the shadda gets a sukun; the shadda itself
            # counts as a letter with a not-yet-defined mark.
            marks.pop()
            marks.push(SUKUN)
            marks.push(NOT_DEF_HARAKA)
            letters.push(SHADDA)
        else:
            marks.push(NOT_DEF_HARAKA)
            letters.push(current)
        current = pending.pop()

    return (''.join(letters.items), ''.join(marks.items))
707 708
def joint(letters, marks):
    """Interleave letters with their marks to rebuild a word.

    The two sequences must have the same length. A mark equal to
    NOT_DEF_HARAKA is not written. When the letter is SHADDA, a haraka
    written just before it is removed first.

    NOTE(review): NOT_DEF_HARAKA is not defined in this module; presumably
    it comes from ``from stack import *`` - verify. The loop also assumes
    Stack.pop() returns None on an empty stack - confirm.

    @param letters: the letters of the word.
    @param marks: one mark per letter.
    @return: the rebuilt word; "" when the lengths differ; False when
        unread items remain once the loop stops.
    """
    # The lengths of letters and marks must be equal.
    if len(letters) != len(marks):
        return ""

    # Reverse the items so that pop() walks both sequences from the start.
    letter_stack = Stack(letters)
    letter_stack.items.reverse()
    mark_stack = Stack(marks)
    mark_stack.items.reverse()
    output = Stack()
    vowels = HARAKAT

    letter = letter_stack.pop()
    mark = mark_stack.pop()
    while letter is not None and mark is not None:
        if letter == SHADDA:
            # Drop the haraka written for the previous letter, if any;
            # anything else is put back untouched.
            previous = output.pop()
            if previous not in vowels:
                output.push(previous)
        output.push(letter)
        if mark != NOT_DEF_HARAKA:
            output.push(mark)
        letter = letter_stack.pop()
        mark = mark_stack.pop()

    if letter_stack.isEmpty() and mark_stack.isEmpty():
        return ''.join(output.items)
    return False
749
def vocalizedlike(word1, word2):
    """Tell whether two words have the same letters and compatible harakat.

    Either word may be fully or partially vocalized: a haraka present in
    one word and missing in the other is ignored, but two different
    harakat at the same position, or two different letters, make the
    words unlike. Shadda is not part of HARAKAT, so it is compared as a
    letter.

    The words are compared from their last character. A mismatch now
    returns False at once and both words must be fully consumed; the
    earlier stack-emptiness test could answer True after a mismatch on
    the first character or when one word had one extra leading letter.

    @param word1: first word.
    @type word1: unicode
    @param word2: second word.
    @type word2: unicode
    @return: True if the words are alike, False otherwise.
    @rtype: Boolean
    """
    vowels = HARAKAT
    i = len(word1) - 1
    j = len(word2) - 1
    while i >= 0 and j >= 0:
        char1 = word1[i]
        char2 = word2[j]
        if char1 == char2:
            i -= 1
            j -= 1
        elif char1 in vowels and char2 not in vowels:
            # haraka only present in word1: skip it
            i -= 1
        elif char1 not in vowels and char2 in vowels:
            # haraka only present in word2: skip it
            j -= 1
        else:
            # two different letters, or two different harakat
            return False
    # Harakat left at the start of either word are ignored like any other
    # unmatched haraka; a leftover letter means the words differ.
    while i >= 0 and word1[i] in vowels:
        i -= 1
    while j >= 0 and word2[j] in vowels:
        j -= 1
    return i < 0 and j < 0
779 #------------------------- 780 # Function def vaznlike(word1,wazn): 781 #-------------------------
def waznlike(word1, wazn):
    """Tell whether a word matches a wazn (morphological pattern).

    In the wazn, FEH, AIN and LAM are generic slots that match any
    character of the word that is not a haraka; every other letter must be
    equal in both. Either side may be fully or partially vocalized: a
    haraka present on one side only is ignored, two different harakat at
    the same position do not match.

    The comparison runs from the last character. A mismatch now returns
    False at once and both strings must be fully consumed; the earlier
    stack-emptiness test could answer True after a mismatch on the first
    character or when one side had one extra leading letter. The debug
    output that used to be printed on every call has been removed.

    @param word1: the word to test.
    @type word1: unicode
    @param wazn: the pattern.
    @type wazn: unicode
    @return: True if the word fits the pattern, False otherwise.
    @rtype: Boolean
    """
    vowels = HARAKAT
    generic = (FEH, AIN, LAM)
    i = len(word1) - 1
    j = len(wazn) - 1
    while i >= 0 and j >= 0:
        char = word1[i]
        slot = wazn[j]
        if char == slot and slot not in generic:
            # identical fixed letter or identical haraka
            i -= 1
            j -= 1
        elif char not in vowels and slot in generic:
            # a root letter of the word fills a generic slot
            i -= 1
            j -= 1
        elif char in vowels and slot not in vowels:
            # haraka only present in the word: skip it
            i -= 1
        elif char not in vowels and slot in vowels:
            # haraka only present in the wazn: skip it
            j -= 1
        else:
            # different fixed letters, or different harakat
            return False
    # Harakat left at the start of either string are ignored like any other
    # unmatched haraka; a leftover letter means there is no match.
    while i >= 0 and word1[i] in vowels:
        i -= 1
    while j >= 0 and wazn[j] in vowels:
        j -= 1
    return i < 0 and j < 0
824
def shaddalike(partial, fully):
    """Tell whether a partially vocalized word agrees with a fully
    vocalized one on letters and shadda placement.

    Harakat are stripped from both words first, so only letters and
    shadda are compared. The fully vocalized word may carry shadda marks
    the partial word lacks, but a shadda in the partial word must be at
    the same place in the fully vocalized one.

    The comparison runs from the last character. A mismatch now returns
    False at once and both words must be fully consumed; the earlier
    stack-emptiness test could answer True after a mismatch on the first
    character or when one word had one extra leading letter.

    @param partial: the partially vocalized word.
    @type partial: unicode
    @param fully: the fully vocalized word.
    @type fully: unicode
    @return: True if the two words are compatible, False otherwise.
    @rtype: Boolean
    """
    partial = stripHarakat(partial)
    fully = stripHarakat(fully)
    i = len(partial) - 1
    j = len(fully) - 1
    while i >= 0 and j >= 0:
        if partial[i] == fully[j]:
            i -= 1
            j -= 1
        elif fully[j] == SHADDA:
            # shadda only present in the fully vocalized word: skip it
            j -= 1
        else:
            # different letters, or a shadda of the partial word that the
            # fully vocalized word does not have at this place
            return False
    # A shadda left at the start of the fully vocalized word is skipped like
    # any other extra shadda; anything else left over means a mismatch.
    while j >= 0 and fully[j] == SHADDA:
        j -= 1
    return i < 0 and j < 0
857