metadata {

authority_id: un
id: 2017
language: iso-639-2:urd
source_script: Arab
destination_script: Latn
name: Romanization of Urdu (2017)
url: https://www.eki.ee/wgrs/rom1_ur.htm
creation_date: 2017
confirmation_date: 2018-06
description: |
  The United Nations recommended system was approved in 1972
  (resolution II/11) and amended in 1977 (resolution III/12),
  based on a report prepared by D. N. Sharma. The tables and
  their corrections were published in volume II of the
  conference reports.

  There is no evidence of the use of the system either in
  Pakistan, India or in international cartographic products.
  Instead, in Pakistan the Hunterian system is officially
  used. The resolutions III/12 (1977) and IV/17 (1982)
  recommended association, inter alia, with Pakistan, in
  carrying out further studies on the system.

  Urdu (Urdū) uses the Perso-Arabic script which is written
  from right to left. In the script vowel points are usually
  omitted which makes it difficult to obtain uniform
  romanizations. Some of the Arabic consonants are
  undifferentiated in romanization which means that the
  system is not fully reversible.
notes:
  - A If preceded by short a, it is romanized ‘ā, e.g. مَعمُل M‘āmul.
  - B When و is imperceptible, e.g. in a few words of Persian origin when preceded by خ (ḳh).
  - C Word-finally after a short vowel.
  - D Marks aspiration of consonants.
  - E The character ے is used only word-finally.

}

tests {

# Each `test` pairs a source string in Arabic script (first argument) with the
# romanization the rules in `stage` are expected to produce (second argument).
test "بوغدِی", "Bvghdī"
test "مَعمُل", "M‘āmul"
test "پَالِير", "Pālīr"
test "بیزوت كَلے", "Byzvt Kale"
test "عَمَل كوٹ", "‘Amal Kvṭ"
test "ثَابِر", "Sābir"
test "شَاه نَثَار ميلة", "Shāh Nasār Mylah"
test "چَپرِی", "Chaprī"
test "أَحمَد خَان كَلے", "Ahmad Ḳhān Kale"
test "دُرَانِي", "Durānī"
test "ڈَنگِیلا", "Ḍangīlā"
test "ذَرَانِی", "Zarānī"
test "بُركِي", "Burkī"
test "گِیدَڑَه", "Gīdaṙah"
test "عَلِي زَائِي", "‘Alī Zā-ī"
test "ژوب", "Ỵvb"
test "بِسَاتُو", "Bisātū"
test "أَحمَدِي شَامَا", "Ahmadī Shāmā"
test "اَصَالَت كَلے", "Asālat Kale"
test "خَضَر خَان", "Ḳhazar Ḳhān"
test "سُلْطَان", "Sultān"
test "عَزَم سَيِّد نُور كَلے", "‘Azam Sayyid Nūr Kale"
test "بغَاكِي", "Bghākī"
test "حَقدَرَه", "Haqdarah"
test "کَچکِینَہ", "Kachkīnaḥ"
test "بَاگَن", "Bāgan"
test "بُلبَلَک", "Bulbalak"
test "بِلیَامِین", "Bilyāmīn"
test "نَہر", "Nahr"
test "اَرَوْالِی", "Arawālī"
test "مَہردِی", "Mahrdī"
test "بَڑھ", "Baṙh"
test "یَاردَا کَلے", "Yārdā Kale"
test "بهَائِي خَان", "Bhā-ī Ḳhān"
test "پھاشک", "Phāshk"
test "تھَلّ", "Thall"
test "پَٹھان ريَا", "Paṭhān Ryā"
test "جھِیل", "Jhīl"
test "غَزْنِي سْپِين", "Ghaznī Spīn"
test "بَادشَاه چھُم", "Bādshāh Chhum"
test "سِندھ", "Sindh"
test "ڈھَنڈ", "Ḍhanḍ"
test "خَان گھَڑِی", "Ḳhān Ghaṙī"
test "غُلَامَک كَلے", "Ghulāmak Kale"
test "خَپیَنگا", "Ḳhapyangā"
test "گَندَه كَلے", "Gandah Kale"
test "مَورپِتھِی", "Maurpithī"
test "درے پلارِی", "Dre Plārī"
test "آگرَہ", "Āgraḥ"
test "ڈَنڈَر", "Ḍanḍar"
test "گُبازانَہ", "Gubāzānaḥ"
test "حَےدَر عَلِی كَلے", "Haidar ‘Alī Kale"
test "تَودَہ چِینَہ", "Taudaḥ Chīnaḥ"
test "مُوسى خَان كَلے", "Mūsá Ḳhān Kale"
test "مُلَّا بَاغ", "Mullā Bāgh"

}

stage {

# Doubled ya (ي / ی + shadda) is rewritten before the parallel block runs.
# NOTE(review): presumably needed because the fatha/kasra + ya rules inside the
# parallel block would otherwise consume the ya and strand the shadda — confirm.
sub any("\u064a\u06cc") + "\u0651", "yy" # ي

# CHARACTERS
parallel {
  # special rules

  sub space, "", after: "\u0622\u0628\u064E\u0627\u062F" # a space followed by آبَاد is removed
  sub "\ufdf2", "Allāh" # ﷲ ligature

  # Vowels, Diphthongs, and Diacritical Marks
  sub "\u064e", "a" # َ fatha
  sub "\u064e\u0627", "ā" # ـَا fatha followed by ا
  sub "\u0627", "ā" # ا
  sub "\u0649\u0670", "ā" # ىٰ
  sub "\u06D2\u0670", "ā" # ےٰ
  sub "\u0622", "ā" # آ
  sub boundary + "\u0627", "" # ا at a word boundary is dropped
  sub "\u064e", "", after: "\u0629" # َ fatha followed by ta' marboota
  sub "\u064e", "", after: "a" + any("ht") # َ fatha followed by ta' marboota, handling different order of conversion

  sub "\u0652", "" # ْ sukun
  sub "\u0659", "ê"

  sub "\u0650", "i" # kasra
  sub "\u0650" + any("\u064a\u06cc"), "ī" # ـِي kasra followed by ي
  sub "\u0650\u06d2\u0652", "e" # ـے
  sub "\u0650\u06d2", "e" # ـے
  sub "\u06d2", "e" # ـے

  sub "\u064f", "u" # ُ damma
  sub "\u064f\u0648", "ū" # ـُو damma followed by و
  sub "\u064f\u0648\u0652", "o" # ـُوْ

  sub "\u064e\u06d2", "ai" # ـَے
  sub "\u064e\u0648", "au" # ـَو
  sub "\u0670", "á" # superscript alef
  sub "\u0649", "á" # ى

  # shadda: the romanized consonant is doubled, digraphs included
  sub "\u0628\u0651", "bb" # ب
  sub "\u062a\u0651", "tt" # ت
  sub "\u062b\u0651", "ss" # ث
  sub "\u062c\u0651", "jj" # ج
  sub "\u062d\u0651", "hh" # ح
  sub "\u062e\u0651", "ḳhḳh" # خ
  sub "\u062f\u0651", "dd" # د
  sub "\u0630\u0651", "zz" # ذ
  sub "\u0631\u0651", "rr" # ر
  sub "\u0632\u0651", "zz" # ز
  sub "\u0633\u0651", "ss" # س
  sub "\u0634\u0651", "shsh" # ش
  sub "\u0635\u0651", "ss" # ص
  sub "\u0636\u0651", "zz" # ض (plain ض is "z" in this system)
  sub "\u0637\u0651", "tt" # ط
  sub "\u0638\u0651", "zz" # ظ
  sub "\u063a\u0651", "ghgh" # غ
  sub "\u0641\u0651", "ff" # ف
  sub "\u0642\u0651", "qq" # ق
  sub "\u0643\u0651", "kk" # ك
  sub "\u0644\u0651", "ll" # ل
  sub "\u0645\u0651", "mm" # م
  sub "\u0646\u0651", "nn" # ن
  sub "\u0647\u0651", "hh" # ه
  sub "\u0648\u0651", "vv" # و

  # marks rendered as "-e"
  sub "\u0650" + boundary, "-e" # ِ kasra at a word boundary
  sub "\u0674", "-e" # ٴ high hamza
  sub "\u0654", "-e" # hamza above

  sub "\u0650\u064a\u0651\u064e", "īy" # ـِيَّ
  sub "\u0650\u064a", "iy", after: any(["\u064e", "\u064f"]) # ـِي kasra + ي followed by fatha or damma
  sub "\u064e\u0649", "ay" # ـَى fatha followed by ى which is ا not ي
  sub "\u064e\u0648\u0652", "aw" # ـَوْ
  sub "\u064e\u064a\u0652", "ay" # ـَيْ
  sub "\u0650\u06cc\u0651\u064e", "īy" # ـِیَّ
  sub "\u064e\u064a", "aī" # ـَي
  sub "\u064e\u06cc", "aī" # ـَی
  # - '-ye'

  # TODO: compress this
  # ta' marboota: "at" by default, "ah" at the end of a line or when the word
  # it closes starts with ال followed by 2 to 13 Arabic-block characters
  # (one rule per length).
  sub "\u0629", "at" # ة in the middle of the sentence
  sub "\u0629" + line_end, "ah"
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")

  # hamza and its carriers
  sub "\u0621", "-" # ء
  sub "\u0624", "-" # ؤ
  sub "\u0626", "-" # ئ

  sub "\u0623", "" # أ
  sub "\u0625", "" # إ
  # definite article, default form
  sub boundary + "\u0627\u0644", "al " # ال
  # '\uFE8E' : ''  # ﺎ

  # Sun letters: the article's l is replaced by the following consonant
  sub boundary + "\u0627\u0644\u062a" + maybe("\u0651"), "at t" # الت
  sub boundary + "\u0627\u0644\u062b" + maybe("\u0651"), "as s" # الث
  sub boundary + "\u0627\u0644\u062f" + maybe("\u0651"), "ad d" # الد
  sub boundary + "\u0627\u0644\u0630" + maybe("\u0651"), "az z" # الذ
  sub boundary + "\u0627\u0644\u0631" + maybe("\u0651"), "ar r" # الر
  sub boundary + "\u0627\u0644\u0632" + maybe("\u0651"), "az z" # الز
  sub boundary + "\u0627\u0644\u0633" + maybe("\u0651"), "as s" # الس
  sub boundary + "\u0627\u0644\u0634" + maybe("\u0651"), "ash sh" # الش
  sub boundary + "\u0627\u0644\u0635" + maybe("\u0651"), "as s" # الص
  sub boundary + "\u0627\u0644\u0636" + maybe("\u0651"), "az z" # الض
  sub boundary + "\u0627\u0644\u0637" + maybe("\u0651"), "at t" # الط
  sub boundary + "\u0627\u0644\u0638" + maybe("\u0651"), "az z" # الظ
  sub boundary + "\u0627\u0644\u0644" + maybe("\u0651"), "al l" # الل
  sub boundary + "\u0627\u0644\u0646" + maybe("\u0651"), "an n" # الن

  # consonant characters

  sub "\u0628", "b" # ب
  sub "\u067E", "p" # پ
  sub "\u062a", "t" # ت
  sub "\u0679", "ṭ" # ٹ
  sub "\u062B", "s" # ث
  sub "\u062c", "j" # ج
  sub "\u0686", "ch" # چ
  sub "\u062d", "h" # ح
  sub "\u062e", "ḳh" # خ
  sub "\u062f", "d" # د
  sub "\u0688", "ḍ" # ڈ
  sub "\u0630", "z" # ذ
  sub "\u0631", "r" # ر
  sub "\u0691", "ṙ" # ڑ
  sub "\u0632", "z" # ز
  sub "\u0698", "ỵ" # ژ
  sub "\u0633", "s" # س
  sub "\u0634", "sh" # ش
  sub "\u0635", "s" # ص
  sub "\u0636", "z" # ض
  sub "\u0637", "t" # ط
  sub "\u0638", "z" # ظ
  sub "\u0639", "‘" # ع
  sub "\u064e\u0639", "‘ā" # ع NOTE A
  sub "\u063a", "gh" # غ
  sub "\u0641", "f" # ف
  sub "\u0642", "q" # ق
  sub "\u0643", "k" # ك
  sub "\u06A9", "k" # ک
  sub "\u06AF", "g" # گ
  sub "\u0644", "l" # ل
  sub "\u0645", "m" # م
  sub any("\u06BA\u0646"), "n" # ن, ں
  sub any("\ufba9\u06c1"), "h" # ہ , ﮩ
  sub any("\ufba9\u06c1") + boundary, "ḥ", before: any("\u064e\u0650\u064f") # ہ , ﮩ NOTE C
  sub any("\u0647\u06be"), "h" # ه, ھ
  sub "\u0648", "v" # و
  sub "\u0648", "ẉ", before: "\u062e" # و NOTE B
  sub any("\u064a\u06cc"), "y" # ي, ی
  # '\u0649' : 'y'  # ي
  sub "\u06D0", "ē" # ې
  sub "\u06CD", "êy" # ۍ
}

# POSTRULES
# Capitalize the first character of every word, except directly after an
# apostrophe-like character or a hyphen.
sub any("\u0061".."\uFFFF"), upcase, before: boundary, not_before: boundary + any("‘’'-")
# don't capitalize the definite article in the middle of a sentence
# (one rule per distinct romanized article form)
sub " At T", " at T" # الت, الط
sub " Ad D", " ad D" # الد
sub " Az Z", " az Z" # الذ, الز, الض, الظ
sub " Ar R", " ar R" # الر
sub " As S", " as S" # الث, الس, الص
sub " Ash Sh", " ash Sh" # الش
sub " Al L", " al L" # الل
sub " An N", " an N" # الن
sub " Al ", " al " # ال

}