module Normalizer

normalizer

Constants

ALEF

Normalize “الف”

ALEF_MADDA
ALEF_WITH_HAMZA_ABOVE
ALEF_WITH_HAMZA_ABOVE10
ALEF_WITH_HAMZA_ABOVE11
ALEF_WITH_HAMZA_ABOVE12
ALEF_WITH_HAMZA_ABOVE13
ALEF_WITH_HAMZA_ABOVE14
ALEF_WITH_HAMZA_ABOVE15
ALEF_WITH_HAMZA_ABOVE16
ALEF_WITH_HAMZA_ABOVE17
ALEF_WITH_HAMZA_ABOVE2
ALEF_WITH_HAMZA_ABOVE3
ALEF_WITH_HAMZA_ABOVE4
ALEF_WITH_HAMZA_ABOVE5
ALEF_WITH_HAMZA_ABOVE6
ALEF_WITH_HAMZA_ABOVE7
ALEF_WITH_HAMZA_ABOVE8
ALEF_WITH_HAMZA_ABOVE9
ALEF_WITH_HAMZA_BELOW
ALEF_WITH_HAMZA_BELOW1
ARABIC_ALEF_MAKSOURA
ARABIC_KAF

Normalize “ک”

ARABIC_YEH

Normalize “ی”

B

Normalize “ب”

CHARACTERS_MAPPINGS
CHEH

Normalize “چ”

CHEH1
CHEH2
CHEH3
CHEH4
CHEH5
CHEH6
DALL

Normalize “د”

DALL1
DALL10
DALL11
DALL2
DALL3
DALL4
DALL5
DALL6
DALL7
DALL8
DALL9
DAMMA
DAMMATAN
DIACRITICS
EIN

Normalize “ع”

EIN1
EIN2
EIN3
EIN4
EIN5
EIN6
FARSI_B
FARSI_B1
FARSI_B2
FARSI_B3
FARSI_B4
FARSI_B5
FARSI_GAF1
FARSI_GAF10
FARSI_GAF11
FARSI_GAF2
FARSI_GAF3
FARSI_GAF4
FARSI_GAF5
FARSI_GAF6
FARSI_GAF7
FARSI_GAF8
FARSI_GAF9
FARSI_HEH

Normalize “ه”

FARSI_HEH1
FARSI_HEH10

Normalize “ة”

FARSI_HEH11
FARSI_HEH12
FARSI_HEH13
FARSI_HEH14
FARSI_HEH15
FARSI_HEH2
FARSI_HEH3
FARSI_HEH4
FARSI_HEH5
FARSI_HEH6
FARSI_HEH7
FARSI_HEH8
FARSI_HEH9
FARSI_J
FARSI_J1
FARSI_KEHEH
FARSI_KEHEH1
FARSI_KEHEH10
FARSI_KEHEH11
FARSI_KEHEH12
FARSI_KEHEH13
FARSI_KEHEH14
FARSI_KEHEH15
FARSI_KEHEH2
FARSI_KEHEH3
FARSI_KEHEH4
FARSI_KEHEH5
FARSI_KEHEH6
FARSI_KEHEH7
FARSI_KEHEH8
FARSI_KEHEH9
FARSI_MIM

Normalize “م”

FARSI_MIM1
FARSI_MIM2
FARSI_MIM3
FARSI_MIM4
FARSI_MIM5
FARSI_MIM6
FARSI_NOON

Normalize “ن”

FARSI_NOON1
FARSI_NOON10
FARSI_NOON2
FARSI_NOON3
FARSI_NOON4
FARSI_NOON5
FARSI_NOON6
FARSI_NOON7
FARSI_NOON8
FARSI_NOON9
FARSI_P
FARSI_P1
FARSI_P2
FARSI_P3
FARSI_P4
FARSI_P5
FARSI_SEH1
FARSI_SEH10
FARSI_SEH11
FARSI_SEH12
FARSI_SEH13
FARSI_SEH14
FARSI_SEH15
FARSI_SEH2
FARSI_SEH3
FARSI_SEH4
FARSI_SEH5
FARSI_SEH6
FARSI_SEH7
FARSI_SEH8
FARSI_SEH9
FARSI_VAV

Normalize “و”

FARSI_VAV1
FARSI_VAV10
FARSI_VAV11
FARSI_VAV12
FARSI_VAV13
FARSI_VAV14
FARSI_VAV15
FARSI_VAV16
FARSI_VAV17
FARSI_VAV18
FARSI_VAV19
FARSI_VAV2
FARSI_VAV20
FARSI_VAV21
FARSI_VAV22
FARSI_VAV23
FARSI_VAV24
FARSI_VAV3
FARSI_VAV4
FARSI_VAV5
FARSI_VAV6
FARSI_VAV7
FARSI_VAV8
FARSI_VAV9
FARSI_YEH
FARSI_YEH1
FARSI_YEH10
FARSI_YEH11
FARSI_YEH12
FARSI_YEH13
FARSI_YEH14
FARSI_YEH15
FARSI_YEH16
FARSI_YEH17
FARSI_YEH18
FARSI_YEH19
FARSI_YEH2
FARSI_YEH20
FARSI_YEH21
FARSI_YEH22
FARSI_YEH23
FARSI_YEH24
FARSI_YEH25
FARSI_YEH26
FARSI_YEH27
FARSI_YEH3
FARSI_YEH4
FARSI_YEH5
FARSI_YEH6
FARSI_YEH7
FARSI_YEH8
FARSI_YEH9
FATHA
FATHATAN
FEH

Normalize “ف”

FEH1
FEH2
FEH3
FEH4
FEH5
FEH6
FEH7
FEH8
FEH9
GAF

Normalize “گ”

GHAF

Normalize “ق”

GHAF1
GHAF2
GHAF3
GHAF4
GHAF5
GHAF6
GHAF7
GHEIN

Normalize “غ”

GHEIN1
GHEIN2
GHEIN3
GHEIN4
GHEIN5
GHEIN6
GHEIN7
HEH_JIMI

Normalize “ح”

HEH_JIMI1
HEH_JIMI2
HEH_JIMI3
HEH_JIMI4
HEH_JIMI5
J

Normalize “ژ”

JIM

Normalize “ج”

JIM1
JIM2
JIM3
JIM4
JIM5
KASRA
KASRATAN
KHEH

Normalize “خ”

KHEH2
KHEH3
KHEH4
KHEH5
KHEH6
KHEH7
LAM

Normalize “ل”

LAM1
LAM2
LAM3
LAM4
LAM5
LAM6
LAM7
LAM8
LAM9
Marks1
Marks2
Marks4
Marks5
Marks6
Marks7
P

Normalize “پ”

Percent_Marks
Percent_Marks1
Q_Marks
Q_Marks1
REH

Normalize “ر”

REH1
REH10
REH11
REH12
REH2
REH3
REH4
REH5
REH6
REH7
REH8
REH9
SAD

Normalize “ص”

SAD1
SAD2
SAD3
SAD4
SAD5
SEH

Normalize “ث”

SHADDA
SHIN

Normalize “ش”

SHIN1
SHIN2
SHIN3
SHIN4
SHIN5
SHIN6
SHIN7
SIN1

Normalize “س”

SIN2
SIN3
SIN4
SIN5
SIN6
SIN7
SUKUN
TA

Normalize “ط”

TA1
TA2
TA3
TA4
TATWIL
ZA

Normalize “ظ”

ZA1
ZA2
ZA3
ZA4
ZA5
ZAD

Normalize “ض”

ZAD1
ZAD2
ZAD3
ZAD4
ZAD5
ZALL

Normalize “ذ”

ZALL1
ZEH

Normalize “ز”

ZEH1
ZEH2
ZEH3
ZEH4

Public Instance Methods

normalize() click to toggle source
# File lib/Normalizer.rb, line 768
# Runs the full normalization pipeline and returns the resulting word.
#
# The steps run in a fixed order: typographic rewrites (regexpersian),
# then character mapping (map_charachters), then diacritic removal
# (remove_diacritics). Each step is expected to update the text exposed by
# +word+; the final value of +word+ is the return value.
def normalize
  # Order matters: later steps operate on the output of earlier ones.
  %i[regexpersian map_charachters remove_diacritics].each { |step| send(step) }
  word
end

Private Instance Methods

get_file_as_string(filename) click to toggle source
# File lib/Normalizer.rb, line 837
# Reads the whole file at +filename+ and returns its contents as one String.
#
# Uses File.read so the file handle is opened, read and closed in a single
# call; the previous implementation opened the file without ever closing it
# and rebuilt the string once per line.
#
# filename - path of the file to read.
#
# Returns the file contents (an empty String for an empty file).
# Raises the usual Errno errors (e.g. Errno::ENOENT) when the file cannot be
# opened.
def get_file_as_string(filename)
  File.read(filename)
end
map_charachters() click to toggle source
# File lib/Normalizer.rb, line 778
# Replaces every occurrence of a mapped character in +word+ with its
# normalized form, storing the result in @word.
#
# The active rules come from filter_rules(CHARACTERS_MAPPINGS), which is used
# here as a Hash of "text to find" => "replacement". Nothing happens (and nil
# is returned) when no rules are active.
#
# The pattern is built with Regexp.union so every key is escaped and matched
# as a whole. Interpolating the raw keys into a [...] character class breaks
# as soon as a key contains a regex metacharacter such as "]", "-", "^" or
# "\", and would split a multi-character key into single characters that
# have no entry in the hash (gsub then replaces them with "").
#
# Returns the updated word, or nil when there are no rules.
def map_charachters
  rules = filter_rules(CHARACTERS_MAPPINGS)
  return if rules.empty?

  @word = word.gsub(Regexp.union(rules.keys), rules)
end
regexpersian() click to toggle source
# File lib/Normalizer.rb, line 792
# Applies the Persian typographic rewrites to +word+, storing the result in
# @word and returning it.
#
# Three groups of rules run in this order:
#
# 1. Detached verbal prefixes: when the prefix stands alone between two
#    spaces, the space after it becomes a zero-width non-joiner (U+200C).
# 2. Detached suffixes: when the suffix stands alone between two spaces, the
#    space before it becomes a zero-width non-joiner.
# 3. Character-for-character replacements (ASCII digits to Persian digits,
#    punctuation, dash and dotted-line look-alikes, newline to space), done
#    in a single pass.
#
# Groups 1 and 2 stay as separate, ordered passes on purpose: neighbouring
# matches share a space, so the outcome depends on which rule runs first.
# Group 3 runs last so that spaces created from newlines never trigger the
# prefix/suffix rules, and it can safely be one pass because no replacement
# character is itself the source of another rule.
#
# The earlier implementation ran one full-string gsub per rule, including
# substitutions that replaced a quote character with itself and a repeated
# em-dash rule; those no-ops are dropped here without changing the output.
def regexpersian
  zwnj = "\u200c"

  # " <prefix> " -> " <prefix><ZWNJ>"
  ['می', 'نمی'].each do |prefix|
    @word = word.gsub(" #{prefix} ", " #{prefix}#{zwnj}")
  end

  # " <suffix> " -> "<ZWNJ><suffix> "
  ['ها', 'های', 'تر', 'تری', 'ات', 'اش', 'ام', 'ای', 'اند', 'ایم', 'اید'].each do |suffix|
    @word = word.gsub(" #{suffix} ", "#{zwnj}#{suffix} ")
  end

  replacements = {
    '0' => '۰', '1' => '۱', '2' => '۲', '3' => '۳', '4' => '۴',
    '5' => '۵', '6' => '۶', '7' => '۷', '8' => '۸', '9' => '۹',
    ';' => '؛',
    '%' => '٪',
    '٬' => '،',
    '–' => '-',
    '˗' => '-',
    '־' => '-',
    # NOTE(review): the ASCII apostrophe is turned into a hyphen, exactly as
    # before. That looks unintended (possibly a different dash-like
    # character was meant) - confirm against the intended rule set.
    "'" => '-',
    '━' => '-',
    '—' => '-',
    '_' => '-',
    '┅' => '…',
    '┄' => '…',
    '┈' => '…',
    "\n" => ' '
  }

  @word = word.gsub(Regexp.union(replacements.keys), replacements)
end
remove_diacritics() click to toggle source
# File lib/Normalizer.rb, line 785
# Deletes every active diacritic from +word+, storing the result in @word.
#
# The active marks come from filter_rules(DIACRITICS), used here as a list
# of strings to strip. Nothing happens (and nil is returned) when the list
# is empty.
#
# The pattern is built with Regexp.union so each entry is escaped.
# Interpolating the raw entries into a [...] character class is fragile: a
# leading "^" would invert the class (deleting everything except the listed
# marks), and "]", "-" or "\" would corrupt it.
#
# Returns the updated word, or nil when there are no rules.
def remove_diacritics
  rules = filter_rules(DIACRITICS)
  return if rules.empty?

  # map(&:to_s) keeps the string conversion that Array#join used to apply.
  @word = word.gsub(Regexp.union(rules.map(&:to_s)), '')
end