module Normalizer
normalizer
Constants
- ALEF
Normalize “الف”
- ALEF_MADDA
- ALEF_WITH_HAMZA_ABOVE
- ALEF_WITH_HAMZA_ABOVE10
- ALEF_WITH_HAMZA_ABOVE11
- ALEF_WITH_HAMZA_ABOVE12
- ALEF_WITH_HAMZA_ABOVE13
- ALEF_WITH_HAMZA_ABOVE14
- ALEF_WITH_HAMZA_ABOVE15
- ALEF_WITH_HAMZA_ABOVE16
- ALEF_WITH_HAMZA_ABOVE17
- ALEF_WITH_HAMZA_ABOVE2
- ALEF_WITH_HAMZA_ABOVE3
- ALEF_WITH_HAMZA_ABOVE4
- ALEF_WITH_HAMZA_ABOVE5
- ALEF_WITH_HAMZA_ABOVE6
- ALEF_WITH_HAMZA_ABOVE7
- ALEF_WITH_HAMZA_ABOVE8
- ALEF_WITH_HAMZA_ABOVE9
- ALEF_WITH_HAMZA_BELOW
- ALEF_WITH_HAMZA_BELOW1
- ARABIC_ALEF_MAKSOURA
- ARABIC_KAF
Normalize “ک”
- ARABIC_YEH
Normalize “ی”
- B
Normalize “ب”
- CHARACTERS_MAPPINGS
- CHEH
Normalize “چ”
- CHEH1
- CHEH2
- CHEH3
- CHEH4
- CHEH5
- CHEH6
- DALL
Normalize “د”
- DALL1
- DALL10
- DALL11
- DALL2
- DALL3
- DALL4
- DALL5
- DALL6
- DALL7
- DALL8
- DALL9
- DAMMA
- DAMMATAN
- DIACRITICS
- EIN
Normalize “ع”
- EIN1
- EIN2
- EIN3
- EIN4
- EIN5
- EIN6
- FARSI_B
- FARSI_B1
- FARSI_B2
- FARSI_B3
- FARSI_B4
- FARSI_B5
- FARSI_GAF1
- FARSI_GAF10
- FARSI_GAF11
- FARSI_GAF2
- FARSI_GAF3
- FARSI_GAF4
- FARSI_GAF5
- FARSI_GAF6
- FARSI_GAF7
- FARSI_GAF8
- FARSI_GAF9
- FARSI_HEH
Normalize “ه”
- FARSI_HEH1
- FARSI_HEH10
Normalize “ة”
- FARSI_HEH11
- FARSI_HEH12
- FARSI_HEH13
- FARSI_HEH14
- FARSI_HEH15
- FARSI_HEH2
- FARSI_HEH3
- FARSI_HEH4
- FARSI_HEH5
- FARSI_HEH6
- FARSI_HEH7
- FARSI_HEH8
- FARSI_HEH9
- FARSI_J
- FARSI_J1
- FARSI_KEHEH
- FARSI_KEHEH1
- FARSI_KEHEH10
- FARSI_KEHEH11
- FARSI_KEHEH12
- FARSI_KEHEH13
- FARSI_KEHEH14
- FARSI_KEHEH15
- FARSI_KEHEH2
- FARSI_KEHEH3
- FARSI_KEHEH4
- FARSI_KEHEH5
- FARSI_KEHEH6
- FARSI_KEHEH7
- FARSI_KEHEH8
- FARSI_KEHEH9
- FARSI_MIM
Normalize “م”
- FARSI_MIM1
- FARSI_MIM2
- FARSI_MIM3
- FARSI_MIM4
- FARSI_MIM5
- FARSI_MIM6
- FARSI_NOON
Normalize “ن”
- FARSI_NOON1
- FARSI_NOON10
- FARSI_NOON2
- FARSI_NOON3
- FARSI_NOON4
- FARSI_NOON5
- FARSI_NOON6
- FARSI_NOON7
- FARSI_NOON8
- FARSI_NOON9
- FARSI_P
- FARSI_P1
- FARSI_P2
- FARSI_P3
- FARSI_P4
- FARSI_P5
- FARSI_SEH1
- FARSI_SEH10
- FARSI_SEH11
- FARSI_SEH12
- FARSI_SEH13
- FARSI_SEH14
- FARSI_SEH15
- FARSI_SEH2
- FARSI_SEH3
- FARSI_SEH4
- FARSI_SEH5
- FARSI_SEH6
- FARSI_SEH7
- FARSI_SEH8
- FARSI_SEH9
- FARSI_VAV
Normalize “و”
- FARSI_VAV1
- FARSI_VAV10
- FARSI_VAV11
- FARSI_VAV12
- FARSI_VAV13
- FARSI_VAV14
- FARSI_VAV15
- FARSI_VAV16
- FARSI_VAV17
- FARSI_VAV18
- FARSI_VAV19
- FARSI_VAV2
- FARSI_VAV20
- FARSI_VAV21
- FARSI_VAV22
- FARSI_VAV23
- FARSI_VAV24
- FARSI_VAV3
- FARSI_VAV4
- FARSI_VAV5
- FARSI_VAV6
- FARSI_VAV7
- FARSI_VAV8
- FARSI_VAV9
- FARSI_YEH
- FARSI_YEH1
- FARSI_YEH10
- FARSI_YEH11
- FARSI_YEH12
- FARSI_YEH13
- FARSI_YEH14
- FARSI_YEH15
- FARSI_YEH16
- FARSI_YEH17
- FARSI_YEH18
- FARSI_YEH19
- FARSI_YEH2
- FARSI_YEH20
- FARSI_YEH21
- FARSI_YEH22
- FARSI_YEH23
- FARSI_YEH24
- FARSI_YEH25
- FARSI_YEH26
- FARSI_YEH27
- FARSI_YEH3
- FARSI_YEH4
- FARSI_YEH5
- FARSI_YEH6
- FARSI_YEH7
- FARSI_YEH8
- FARSI_YEH9
- FATHA
- FATHATAN
- FEH
Normalize “ف”
- FEH1
- FEH2
- FEH3
- FEH4
- FEH5
- FEH6
- FEH7
- FEH8
- FEH9
- GAF
Normalize “گ”
- GHAF
Normalize “ق”
- GHAF1
- GHAF2
- GHAF3
- GHAF4
- GHAF5
- GHAF6
- GHAF7
- GHEIN
Normalize “غ”
- GHEIN1
- GHEIN2
- GHEIN3
- GHEIN4
- GHEIN5
- GHEIN6
- GHEIN7
- HEH_JIMI
Normalize “ح”
- HEH_JIMI1
- HEH_JIMI2
- HEH_JIMI3
- HEH_JIMI4
- HEH_JIMI5
- J
Normalize “ژ”
- JIM
Normalize “ج”
- JIM1
- JIM2
- JIM3
- JIM4
- JIM5
- KASRA
- KASRATAN
- KHEH
Normalize “خ”
- KHEH2
- KHEH3
- KHEH4
- KHEH5
- KHEH6
- KHEH7
- LAM
Normalize “ل”
- LAM1
- LAM2
- LAM3
- LAM4
- LAM5
- LAM6
- LAM7
- LAM8
- LAM9
- Marks1
- Marks2
- Marks4
- Marks5
- Marks6
- Marks7
- P
Normalize “پ”
- Percent_Marks
- Percent_Marks1
- Q_Marks
- Q_Marks1
- REH
Normalize “ر”
- REH1
- REH10
- REH11
- REH12
- REH2
- REH3
- REH4
- REH5
- REH6
- REH7
- REH8
- REH9
- SAD
Normalize “ص”
- SAD1
- SAD2
- SAD3
- SAD4
- SAD5
- SEH
Normalize “ث”
- SHADDA
- SHIN
Normalize “ش”
- SHIN1
- SHIN2
- SHIN3
- SHIN4
- SHIN5
- SHIN6
- SHIN7
- SIN1
Normalize “س”
- SIN2
- SIN3
- SIN4
- SIN5
- SIN6
- SIN7
- SUKUN
- TA
Normalize “ط”
- TA1
- TA2
- TA3
- TA4
- TATWIL
- ZA
Normalize “ظ”
- ZA1
- ZA2
- ZA3
- ZA4
- ZA5
- ZAD
Normalize “ض”
- ZAD1
- ZAD2
- ZAD3
- ZAD4
- ZAD5
- ZALL
Normalize “ذ”
- ZALL1
- ZEH
Normalize “ز”
- ZEH1
- ZEH2
- ZEH3
- ZEH4
Public Instance Methods
normalize()
click to toggle source
# File lib/Normalizer.rb, line 768
# Runs the normalization passes over the current word and returns the result.
#
# The passes are executed in a fixed order: regexpersian, then
# map_charachters, then remove_diacritics. Each pass is invoked for its side
# effect; the value returned is whatever `word` yields afterwards.
def normalize
  pipeline = %i[regexpersian map_charachters remove_diacritics]
  # `send` is used so the private passes can be dispatched by name.
  pipeline.each { |pass| send(pass) }
  word
end
Private Instance Methods
get_file_as_string(filename)
click to toggle source
# File lib/Normalizer.rb, line 837
# Reads the file at +filename+ and returns its entire content as one String.
#
# filename - path of the file to read (opened in "r" mode).
#
# Returns the file content; an empty file yields an empty String.
# Raises the usual SystemCallError subclasses (e.g. Errno::ENOENT) when the
# file cannot be opened.
def get_file_as_string(filename)
  # The block form guarantees the handle is closed even if reading raises;
  # reading in one call avoids rebuilding the string once per line.
  File.open(filename, "r") { |file| file.read }
end
map_charachters()
click to toggle source
# File lib/Normalizer.rb, line 778
# Replaces every occurrence of a mapped character in the current word.
#
# The rules come from filter_rules(CHARACTERS_MAPPINGS) and are used as a
# Hash: its keys are the texts to find and the Hash itself is handed to
# String#gsub, which substitutes each match with the value stored under it.
#
# Returns nil when there are no rules, otherwise the new value of @word.
def map_charachters
  rules = filter_rules(CHARACTERS_MAPPINGS)
  return if rules.empty?

  # Regexp.union escapes each key, so keys such as "-", "]", "^" or "\\"
  # can no longer corrupt the pattern the way raw interpolation into a
  # [...] character class could, and keys longer than one character are
  # matched as a whole (so the Hash lookup finds them).
  matcher = Regexp.union(rules.keys)
  @word = word.gsub(matcher, rules)
end
regexpersian()
click to toggle source
# File lib/Normalizer.rb, line 792
# Applies a fixed, ordered list of literal substitutions to the current word.
#
# The table below is processed top to bottom; each step reads `word`, runs
# one gsub and stores the result in @word. The steps cover:
#   * space-delimited tokens that are re-attached to a neighbour with
#     U+200C (zero-width non-joiner),
#   * ASCII digits, ";" and "%" replaced by the characters listed,
#   * several dash-like and dotted-line characters replaced by "-" or "…",
#   * newlines replaced by a single space.
#
# Returns the final value of @word.
#
# NOTE(review): the patterns /'/ and /—/ each occur twice. The first /'/
# step leaves the text unchanged and the second turns the same character
# into "-"; possibly these were distinct characters in the original
# source - confirm before changing. Order is preserved as found.
def regexpersian
  #finds = data.scan(/"(.*)"/)
  substitutions = [
    [/ می /, " می\u200c"],
    [/ نمی /, " نمی\u200c"],
    [/ ها /, "\u200cها "],
    [/ های /, "\u200cهای "],
    [/ تر /, "\u200cتر "],
    [/ تری /, "\u200cتری "],
    [/ ات /, "\u200cات "],
    [/ اش /, "\u200cاش "],
    [/ ام /, "\u200cام "],
    [/ ای /, "\u200cای "],
    [/1/, "۱"],
    [/2/, "۲"],
    [/3/, "۳"],
    [/4/, "۴"],
    [/5/, "۵"],
    [/6/, "۶"],
    [/7/, "۷"],
    [/8/, "۸"],
    [/9/, "۹"],
    [/0/, "۰"],
    [/;/, "؛"],
    [/%/, "٪"],
    [/'/, "\'"],
    [/ اند /, "\u200cاند "],
    [/ ایم /, "\u200cایم "],
    [/ اید /, "\u200cاید "],
    [/"/, "\""],
    [/٬/, "\،"],
    [/–/, "\-"],
    [/˗/, "\-"],
    [/־/, "\-"],
    [/'/, "\-"],
    [/━/, "\-"],
    [/—/, "\-"],
    [/—/, "\-"],
    [/_/, "\-"],
    [/┅/, "\…"],
    [/┄/, "\…"],
    [/┈/, "\…"],
    [/\n/, " "]
  ]

  # Later steps see the output of earlier ones, so the order matters.
  substitutions.each do |pattern, replacement|
    @word = word.gsub(pattern, replacement)
  end
  @word
end
remove_diacritics()
click to toggle source
# File lib/Normalizer.rb, line 785
# Deletes every listed diacritic from the current word.
#
# The rules come from filter_rules(DIACRITICS) and are treated as a list of
# strings; each occurrence of any of them is replaced by the empty string.
#
# Returns nil when there are no rules, otherwise the new value of @word.
def remove_diacritics
  rules = filter_rules(DIACRITICS)
  return if rules.empty?

  # Regexp.union escapes every entry, so an entry that happens to be a
  # regex metacharacter ("-", "]", "^", "\\") cannot break or alter the
  # pattern as it could when interpolated raw into a [...] class.
  matcher = Regexp.union(rules)
  @word = word.gsub(matcher, '')
end