1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """
19 Arabic module
20 """
import re
"""
The Arabic chars: contains all Arabic letters; a subclass of unicode.
"""
26
# Single-character constants, one per Arabic code point used by this module.

# Punctuation
COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'

# Hamza forms and letters, in code-point order (TATWEEL sits inside this range)
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
BEH = u'\u0628'
TEH_MARBUTA = u'\u0629'
TEH = u'\u062a'
THEH = u'\u062b'
JEEM = u'\u062c'
HAH = u'\u062d'
KHAH = u'\u062e'
DAL = u'\u062f'
THAL = u'\u0630'
REH = u'\u0631'
ZAIN = u'\u0632'
SEEN = u'\u0633'
SHEEN = u'\u0634'
SAD = u'\u0635'
DAD = u'\u0636'
TAH = u'\u0637'
ZAH = u'\u0638'
AIN = u'\u0639'
GHAIN = u'\u063a'
TATWEEL = u'\u0640'
FEH = u'\u0641'
QAF = u'\u0642'
KAF = u'\u0643'
LAM = u'\u0644'
MEEM = u'\u0645'
NOON = u'\u0646'
HEH = u'\u0647'
WAW = u'\u0648'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'

# Combining madda / hamza marks
MADDA_ABOVE = u'\u0653'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'

# Digits U+0660..U+0669
ZERO = u'\u0660'
ONE = u'\u0661'
TWO = u'\u0662'
THREE = u'\u0663'
FOUR = u'\u0664'
FIVE = u'\u0665'
SIX = u'\u0666'
SEVEN = u'\u0667'
EIGHT = u'\u0668'
NINE = u'\u0669'

# Numeric punctuation and miscellaneous signs
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'
MINI_ALEF = u'\u0670'
ALEF_WASLA = u'\u0671'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'
88
89
# Tashkeel (diacritic) marks, U+064B..U+0652
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'
98
99
# Small (superscript) letters
SMALL_ALEF = u'\u0670'
SMALL_WAW = u'\u06E5'
SMALL_YEH = u'\u06E6'

# Lam-alef ligatures (single presentation-form code points) ...
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'
# ... and their two-letter spellings (LAM followed by the matching alef form)
simple_LAM_ALEF = u'\u0644\u0627'
simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'
112
# All letters and hamza-carrying forms concatenated into one unicode string.
LETTERS = u''.join([
    ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH,
    DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH,
    AIN, GHAIN, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW, YEH,
    HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, WAW_HAMZA, ALEF_HAMZA_BELOW, YEH_HAMZA,
])
119
# TASHKEEL: every diacritic mark (HARAKAT plus SHADDA).
TASHKEEL = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
    SHADDA,
)
# HARAKAT: tanwin, short vowels and sukun -- SHADDA excluded.
HARAKAT = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
)
# SHORTHARAKAT: the three short vowels plus sukun.
SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)

# TANWIN: the three doubled-vowel marks.
TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
132
# Character classes used by the membership predicates further down.
# NOTE: the spelling LIGUATURES is kept because other code reads it by that name.
LIGUATURES = (
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
)
HAMZAT = (
    HAMZA,
    WAW_HAMZA,
    YEH_HAMZA,
    HAMZA_ABOVE,
    HAMZA_BELOW,
    ALEF_HAMZA_BELOW,
    ALEF_HAMZA_ABOVE,
)
ALEFAT = (
    ALEF,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF_WASLA,
    ALEF_MAKSURA,
    SMALL_ALEF,
)
WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)

WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
TEHLIKE = (TEH, TEH_MARBUTA)
163
# Small (superscript) letters; the original misspelled SMALL_ALEF as SAMLL_ALEF.
SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)
# Moon letters. The original closed with "),", which wrapped this tuple
# inside a 1-element tuple and made every `x in MOON` test False.
MOON = (
    HAMZA,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF,
    BEH,
    JEEM,
    HAH,
    KHAH,
    AIN,
    GHAIN,
    FEH,
    QAF,
    KAF,
    MEEM,
    HEH,
    WAW,
    YEH,
)
# Sun letters.
SUN = (
    TEH,
    THEH,
    DAL,
    THAL,
    REH,
    ZAIN,
    SEEN,
    SHEEN,
    SAD,
    DAD,
    TAH,
    ZAH,
    LAM,
    NOON,
)
# Rank of each letter in alphabetical order. TEH_MARBUTA shares rank 3 with
# TEH, and every hamza-carrying form shares rank 29 with HAMZA.
AlphabeticOrder = {
    ALEF: 1,
    BEH: 2,
    TEH: 3,
    TEH_MARBUTA: 3,
    THEH: 4,
    JEEM: 5,
    HAH: 6,
    KHAH: 7,
    DAL: 8,
    THAL: 9,
    REH: 10,
    ZAIN: 11,
    SEEN: 12,
    SHEEN: 13,
    SAD: 14,
    DAD: 15,
    TAH: 16,
    ZAH: 17,
    AIN: 18,
    GHAIN: 19,
    FEH: 20,
    QAF: 21,
    KAF: 22,
    LAM: 23,
    MEEM: 24,
    NOON: 25,
    HEH: 26,
    WAW: 27,
    YEH: 28,
    HAMZA: 29,

    ALEF_MADDA: 29,
    ALEF_HAMZA_ABOVE: 29,
    WAW_HAMZA: 29,
    ALEF_HAMZA_BELOW: 29,
    YEH_HAMZA: 29,
}
# Arabic name of each letter and tashkeel mark.
# The original read "NAMES -{", a syntax error; an assignment was intended.
NAMES = {
    ALEF: u'ألف',
    BEH: u'باء',
    TEH: u'تاء',
    TEH_MARBUTA: u'تاء مربوطة',
    THEH: u'ثاء',
    JEEM: u'جيم',
    HAH: u'حاء',
    KHAH: u'خاء',
    DAL: u'دال',
    THAL: u'ذال',
    REH: u'راء',
    ZAIN: u'زاي',
    SEEN: u'سين',
    SHEEN: u'شين',
    SAD: u'صاد',
    DAD: u'ضاد',
    TAH: u'طاء',
    ZAH: u'ظاء',
    AIN: u'عين',
    GHAIN: u'غين',
    FEH: u'فاء',
    QAF: u'قاف',
    KAF: u'كاف',
    LAM: u'لام',
    MEEM: u'ميم',
    NOON: u'نون',
    HEH: u'هاء',
    WAW: u'واو',
    YEH: u'ياء',
    HAMZA: u'همزة',

    ALEF_MADDA: u'ألف ممدودة',
    ALEF_HAMZA_ABOVE: u'همزة على الألف',
    WAW_HAMZA: u'همزة على الواو',
    ALEF_HAMZA_BELOW: u'همزة تحت الألف',
    YEH_HAMZA: u'همزة على الياء',
    FATHATAN: u'فتحتان',
    DAMMATAN: u'ضمتان',
    KASRATAN: u'كسرتان',
    FATHA: u'فتحة',
    DAMMA: u'ضمة',
    KASRA: u'كسرة',
    SHADDA: u'شدة',
    SUKUN: u'سكون',
}
285
286
# Pre-compiled character-class patterns, one per mark/letter family.
# Fixes relative to the original:
#  - the second assignment reused the name HARAKAT_pattern, so the HARAKAT
#    pattern was overwritten and TASHKEEL_pattern (read elsewhere in this
#    file) was never defined;
#  - the ligature pattern read the undefined name LIGATURES (the tuple is
#    spelled LIGUATURES);
#  - ur"..." literals are Python-2-only; u"[" and u"]" contain no
#    backslashes, so the plain u"" form yields the same string.
HARAKAT_pattern = re.compile(u"[" + u"".join(HARAKAT) + u"]")
TASHKEEL_pattern = re.compile(u"[" + u"".join(TASHKEEL) + u"]")
HAMZAT_pattern = re.compile(u"[" + u"".join(HAMZAT) + u"]")
ALEFAT_pattern = re.compile(u"[" + u"".join(ALEFAT) + u"]")
LIGUATURES_pattern = re.compile(u"[" + u"".join(LIGUATURES) + u"]")
292
295
296
297
298
300 """Checks for Arabic Sukun Mark.
301 @param archar: arabic unicode char
302 @type archar: unicode
303 """
304 if archar==self.SUKUN:
305 return True;
306 else: return False;
307
309 """Checks for Arabic Shadda Mark.
310 @param archar: arabic unicode char
311 @type archar: unicode
312 """
313 if archar==self.SHADDA:
314 return True;
315 else: return False;
316
318 """Checks for Arabic Tatweel letter modifier.
319 @param archar: arabic unicode char
320 @type archar: unicode
321 """
322 if archar==self.TATWEEL:
323 return True;
324 else: return False;
326 """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
327 @param archar: arabic unicode char
328 @type archar: unicode
329 """
330 if archar in self.TANWIN:
331 return True;
332 else: return False;
333
335 """Checks for Arabic Tashkeel Marks (FATHA,DAMMA,KASRA, SUKUN, SHADDA, FATHATAN,DAMMATAN, KASRATAn).
336 @param archar: arabic unicode char
337 @type archar: unicode
338 """
339
340 if archar in self.TASHKEEL:
341 return True;
342 else: return False;
343
345 """Checks for Arabic Harakat Marks (FATHA,DAMMA,KASRA,SUKUN,TANWIN).
346 @param archar: arabic unicode char
347 @type archar: unicode
348 """
349 if archar in self.HARAKAT:
350 return True;
351 else: return False;
352
354 """Checks for Arabic short Harakat Marks (FATHA,DAMMA,KASRA,SUKUN).
355 @param archar: arabic unicode char
356 @type archar: unicode
357 """
358 if archar in self.SHORTHARAKAT:
359 return True;
360 else: return False;
361
363 """Checks for Arabic Ligatures like LamAlef.
364 (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
365 @param archar: arabic unicode char
366 @type archar: unicode
367 """
368 if archar in self.LIGUATURES:
369 return True;
370 else: return False;
371
373 """Checks for Arabic Hamza forms.
374 HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE )
375 @param archar: arabic unicode char
376 @type archar: unicode
377 """
378 if archar in self.HAMZAT:
379 return True;
380 else: return False;
381
383 """Checks for Arabic Alef forms.
384 ALEFAT=(ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,ALEF_WASLA, ALEF_MAKSURA );
385 @param archar: arabic unicode char
386 @type archar: unicode
387 """
388 if archar in self.ALEFAT:
389 return True;
390 else: return False;
391
393 """Checks for Arabic Yeh forms.
394 Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
395 @param archar: arabic unicode char
396 @type archar: unicode
397 """
398 if archar in self.YEHLIKE:
399 return True;
400 else: return False;
401
403 """Checks for Arabic Waw like forms.
404 Waw forms : WAW, WAW_HAMZA, SMALL_WAW
405 @param archar: arabic unicode char
406 @type archar: unicode
407 """
408 if archar in self.WAWLIKE:
409 return True;
410 else: return False;
411
413 """Checks for Arabic Teh forms.
414 Teh forms : TEH, TEH_MARBUTA
415 @param archar: arabic unicode char
416 @type archar: unicode
417 """
418 if archar in self.TEHLIKE:
419 return True;
420 else: return False;
422 """Checks for Arabic Small letters.
423 SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
424 @param archar: arabic unicode char
425 @type archar: unicode
426 """
427 if archar in self.SMALL:
428 return True;
429 else: return False;
430
432 """Checks for Arabic Weak letters.
433 Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
434 @param archar: arabic unicode char
435 @type archar: unicode
436 """
437 if archar in self.WEAK:
438 return True;
439 else: return False;
440
442 """Checks for Arabic Moon letters.
443 Moon Letters :
444 @param archar: arabic unicode char
445 @type archar: unicode
446 """
447 if archar in self.MOON:
448 return True;
449 else: return False;
450
452 """Checks for Arabic Sun letters.
453 Moon Letters :
454 @param archar: arabic unicode char
455 @type archar: unicode
456 """
457 if archar in self.SUN:
458 return True;
459 else: return False;
460
461
462
464 """return Arabic letter order between 1 and 29.
465 Alef order is 1, Yeh is 28, Hamza is 29.
466 Teh Marbuta has the same ordre with Teh, 3.
467 @param archar: arabic unicode char
468 @type archar: unicode
469 @return: arabic order.
470 @rtype: integer;
471 """
472 if self.AlphabeticOrder.has_key(archar):
473 return self.ArabicOrder[archar];
474 else: return 0;
475
def name(self, archar):
    """Return the Arabic name of a letter or tashkeel mark.

    @param archar: arabic unicode char
    @type archar: unicode
    @return: the entry of NAMES for archar, or 0 when archar has no entry
        (0 is kept as the fallback for backward compatibility).
    @rtype: unicode
    """
    # The original tested NAMES but returned self.ArabicOrder[archar],
    # i.e. it looked the character up in the wrong (and undefined) table.
    return self.NAMES.get(archar, 0)
488
490 """return a list of arabic characteres .
491 Return a list of characteres between \u060c to \u0652
492 @return: list of arabic characteres.
493 @rtype: unicode;
494 """
495 myslist=[];
496 for i in range(0x0600, 0x00652):
497 try :
498 mylist.append=unichr(i);
499 except ValueError:
500 pass;
501 return mylist;
502
503
504
505
506
508 """Checks if the arabic word contains shadda.
509 @param word: arabic unicode char
510 @type word: unicode
511 """
512 if re.search(self.SHADDA,word):
513 return True;
514 else:
515 return False;
516
517
518
519
521 """Checks if the arabic word is vocalized.
522 the word musn't have any spaces and pounctuations.
523 @param word: arabic unicode char
524 @type word: unicode
525 """
526 if word.isalpha(): return False;
527
528 else:
529 if re.search(self.HARAKAT_pattern,word):
530 return True;
531 else:
532 return False;
def isVocalizedtext(self, text):
    """Check whether an Arabic text is vocalized.

    The text may contain many words and spaces.

    @param text: arabic unicode text
    @type text: unicode
    @return: True if text contains at least one mark matched by
        HARAKAT_pattern, False otherwise.
    @rtype: Boolean
    """
    # The original searched `word`, which is not defined in this function.
    return re.search(self.HARAKAT_pattern, text) is not None
544 """ Checks for an Arabic Unicode block characters;
545 @param text: input text
546 @type text: unicode
547 @return: True if all charaters are in Arabic block
548 @rtype: Boolean
549 """
550 pass;
551 if len(word)==0: return False;
552 word_nm=ar_strip_marks_keepshadda(word);
553
554 word_nm=word_nm.replace(ALEF_MADDA,HAMZA+ALEF);
555 if word[0] in (WAW_HAMZA,YEH_HAMZA,FATHA,DAMMA,SUKUN,KASRA):
556 return False;
557
558 if re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA,word):
559 return False;
560 if re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%(TEH_MARBUTA,DAMMA,KASRA,FATHA),word):
561 return False;
562
563
564 if re.search(u"([^\u0621-\u0652%s%s%s])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE),word):
565 return False;
566 if re.match(u"([\d])+",word):
567 return False;
568 return True;
569
"""Check for a valid Arabic word.

Not implemented: the body is a placeholder and returns None.

@param word: input word
@type word: unicode
@return: True if all characters are in the Arabic block (intended contract)
@rtype: Boolean
"""
pass
579
580
581
583 """Strip Harakat from arabic word except Shadda.
584 The striped marks are :
585 - FATHA, DAMMA, KASRA
586 - SUKUN
587 - FATHATAN, DAMMATAN, KASRATAN, , , .
588 Example:
589 >>> text=u"الْعَرَبِيّةُ"
590 >>> stripTashkeel(text)
591 العربيّة
592
593 @param text: arabic text.
594 @type text: unicode.
595 @return: return a striped text.
596 @rtype: unicode.
597 """
598 return re.sub(self.HARAKAT_pattern,u'',text)
599
601 """Strip vowels from a text, include Shadda.
602 The striped marks are :
603 - FATHA, DAMMA, KASRA
604 - SUKUN
605 - SHADDA
606 - FATHATAN, DAMMATAN, KASRATAN, , , .
607 Example:
608 >>> text=u"الْعَرَبِيّةُ"
609 >>> stripTashkeel(text)
610 العربية
611
612 @param text: arabic text.
613 @type text: unicode.
614 @return: return a striped text.
615 @rtype: unicode.
616 """
617 return re.sub(self.TASHKEEL_pattern,'',text);
618
620 """
621 Strip tatweel from a text and return a result text.
622
623 Example:
624 >>> text=u"العـــــربية"
625 >>> stripTatweel(text)
626 العربية
627
628 @param text: arabic text.
629 @type text: unicode.
630 @return: return a striped text.
631 @rtype: unicode.
632 """
633 return re.sub(self.TATWEEL,'',word);
634
636 """Normalize Lam Alef ligatures into two letters (LAM and ALEF), and Tand return a result text.
637 Some systems present lamAlef ligature as a single letter, this function convert it into two letters,
638 The converted letters into LAM and ALEF are :
639 - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
640
641 Example:
642 >>> text=u"لانها لالء الاسلام"
643 >>> normalize_lamalef(text)
644 لانها لالئ الاسلام
645
646 @param text: arabic text.
647 @type text: unicode.
648 @return: return a converted text.
649 @rtype: unicode.
650 """
651 return self.LIGUATURES_pattern.sub(u'%s%s'%(self.LAM,self.ALEF), text)
652
654 """return True if the given word have the same or the partial vocalisation like the pattern vocalized
655
656 @param word: arabic word, full/partial vocalized.
657 @type word: unicode.
658 @param vocalized: arabic full vocalized word.
659 @type vocalized: unicode.
660 @return: True if vocalized.
661 @rtype: unicode.
662 """
663 if not self.isVocalized(vocalized) or not self.isVocalized(word):
664 if self.isVocalized(vocalized):
665 vocalized=self.stripTashkeel(vocalized);
666 if self.isVocalized(word):
667 word=self.stripTashkeel(word);
668 if word==vocalized:
669 return True;
670 else:
671 return False;
672 else:
673 for mark in self.TASHKEEL:
674 vocalized=re.sub(u"[%s]"%mark,u"[%s]?"%mark,vocalized)
675 vocalized="^"+vocalized+"$";
676 pat=re.compile("^"+vocalized+"$");
677 if pat.match("^"+vocalized+"$",word):
678 return True;
679 else: return False;
680