metadata {

authority_id: bgnpcgn
id: 2007
language: iso-639-2:urd
source_script: Arab
destination_script: Latn
name: Romanization of Urdu (2007)
url: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/693788/ROMANIZATION_OF_URDU.pdf
creation_date: 2007
confirmation_date: 2017-11
description: |
  The following is the approved romanization system for
  deriving standard spellings of Urdu geographical names for
  Pakistan. It was jointly adopted by BGN and PCGN at the
  23rd BGN/PCGN Conference in Washington, DC, in 2007 and it
  is based on the Hunterian romanization system for Urdu,
  which has been used by the Surveys of India and Pakistan
  for romanizing Urdu geographical names for more than one
  hundred years. The BGN/PCGN system laid out below includes
  diacritical marks in order that the original script can be
  derived from the romanized form (i.e. it is reversible).
  For desk users requiring a diacritic-free form, these
  diacritics can simply be removed. In every case the same
  basic Roman-script characters are kept as are used in the
  Hunterian system. The BGN/PCGN forms have further been
  designed to harmonize with the BGN/PCGN Persian
  romanization system.
notes:
  - 1. When the vowel sign zīr ( ِ) occurs word-finally in the
    first element of a compound, it is assumed to mark the
    Persian izafat
    morpheme, and is romanized -e, not i.
  - 2. The source of almost all example names is the 1951
    Census of Pakistan, Village List, Northwest Frontier
    Province, Chitral
    State. Office of the Provincial Superintendant of Census,
    North-West Frontier Province, Peshawar.
  - 3. No examples of aspirated dental r (rh, رھ) were found,
    although this phoneme is assumed to be part of the phonology
    of Urdu; it was therefore left out of Table 2.
  - 4. Note that the short vowels in the Urdu examples are not
    pointed.
  - 5. Occasionally, sequences of /z/ or /s/ plus /h/ may be
    encountered, i.e. z·h, s·h. These may be romanized with the
    Unicode
    'center dot' (U+00B7) separating the two letters, to
    distinguish them from the digraphs /zh/ and /sh/.
  - The commented-out tests are blocked by https://github.com/interscript/interscript/issues/572
    because they depend on how ي (y or e) and و (u or o) are disambiguated.

}

tests {

# Each active case is `test <Urdu source>, <expected romanization>`.
# Entries that are commented out (kept in the older `- source:` /
# `expected:` layout) are disabled on purpose: per the metadata notes they
# are blocked by https://github.com/interscript/interscript/issues/572,
# i.e. they need ي to be romanized as y or e and و as u or o depending on
# context.

# - source: بوغدِی
#   expected: Boghdī
test "پَالِير", "Pālīr"
# - source: بیزوت كَلے
#   expected: Bezot Kale
# - source: عَمَل كوٹ
#   expected: ‘Amal Koṭ
test "ثَابِر", "S̄ābir"
test "شَاه نَثَار ميلة", "Shāh Nas̄ār Mylah"
# - source: بَرجُو ميلَه
#   expected: Barjū Melah
test "چَپرِی", "Chaprī"
test "أَحمَد خَان كَلے", "Aḩmad Khān Kale"
# - source: آكَا خيل
#   expected: Ākā Khel
test "دُرَانِي", "Durānī"
test "ڈَنگِیلا", "Ḍangīlā"
test "ذَرَانِی", "Z̄arānī"
test "بُركِي", "Burkī"
test "گِیدَڑَه", "Gīdaṛah"
test "عَلِي زَائِي", "‘Alī Zā’ī"
# - source: ژوب
#   expected: Zhob
test "بِسَاتُو", "Bisātū"
test "أَحمَدِي شَامَا", "Aḩmadī Shāmā"
test "اَصَالَت كَلے", "Aşālat Kale"
test "خَضَر خَان", "Khaẕar Khān"
test "سُلْطَان", "Sulţān"
test "عَزَم سَيِّد نُور كَلے", "‘Azam Sayyid Nūr Kale"
# - source: عَلَم شير
#   expected: ‘Alam Sher
test "بغَاكِي", "Bghākī"
# - source: مُظَفَر كوٹ
#   expected: Muz̧afar Koṭ
test "حَقدَرَه", "Ḩaqdarah"
test "کَچکِینَہ", "Kachkīnah"
test "بَاگَن", "Bāgan"
test "بُلبَلَک", "Bulbalak"
test "بِلیَامِین", "Bilyāmīn"
test "نَہر", "Nahr"
# - source: جوکَالِیَاں
#   expected: Jokālīāñ
test "اَرَوْالِی", "Arawālī"
# - source: هیروشاه
#   expected: Heroshāh
test "مَہردِی", "Mahrdī"
test "بَڑھ", "Baṛh"
# - source: شِیوَاؤ
#   expected: Shīwā’o
test "یَاردَا کَلے", "Yārdā Kale"
test "بهَائِي خَان", "Bhā’ī Khān"
test "پھاشک", "Phāshk"
test "تھَلّ", "Thall"
test "پَٹھان ريَا", "Paṭhān Ryā"
test "جھِیل", "Jhīl"
test "غَزْنِي سْپِين", "Ghaznī Spīn"
test "بَادشَاه چھُم", "Bādshāh Chhum"
test "سِندھ", "Sindh"
test "ڈھَنڈ", "Ḍhanḍ"
# - source: غوزگَڑھِی
#   expected: Ghozgaṛhī
# - source: دوغَل گاکھَر
#   expected: Doghal Gākhar
test "خَان گھَڑِی", "Khān Ghaṛī"
test "غُلَامَک كَلے", "Ghulāmak Kale"
# - source: کاراخیل
#   expected: Kārākhel
test "خَپیَنگا", "Khapyangā"
test "گَندَه كَلے", "Gandah Kale"
# - source: گُلونَا ڈھيرِي
#   expected: Gulonā Ḍherī
# - source: خيرَه دِين
#   expected: Kherah Dīn
test "مَورپِتھِی", "Maurpithī"
test "درے پلارِی", "Dre Plārī"
test "آگرَہ", "Āgrah"
test "ڈَنڈَر", "Ḍanḍar"
# - source: گِیدو
#   expected: Gīdo
test "گُبازانَہ", "Gubāzānah"
# - source: اُوشو
#   expected: Ūsho
test "حَےدَر عَلِی كَلے", "Ḩaidar ‘Alī Kale"
test "تَودَہ چِینَہ", "Taudah Chīnah"
test "مُوسى خَان كَلے", "Mūsá Khān Kale"
test "مُلَّا بَاغ", "Mullā Bāgh"

}

stage {

# CHARACTERS
parallel {
  # special rules

  sub space, "", after: "\u0622\u0628\u064E\u0627\u062F" # a space followed by آبَاد (abad) is removed, joining the suffix to the preceding word
  sub "\ufdf2", "Allāh" # U+FDF2 (ﷲ ligature) is spelled out. NOTE(review): this used to say "See note 5", but note 5 of this file's metadata is about the centre dot — confirm the intended reference

  # Vowels, Diphthongs, and Diacritical Marks
  # Multi-character sequences (diacritic + letter) are listed next to the
  # single-character fallbacks they refine; the active tests (e.g. ـِي -> ī)
  # rely on the longer sequence being preferred.
  sub "\u064e", "a" # َ fatha
  sub "\u064e", "", after: "\u0629" # fatha immediately followed by ة (ta' marboota) is dropped
  # Same rule for when ة has already been converted to "ah"/"at".
  # any("ht") is the character-set form used by the other any("...") calls in
  # this file; the former "h|t" also put a literal "|" into the set.
  sub "\u064e", "", after: "a" + any("ht")

  sub "\u0652", "" # ْ sukun is not romanized
  sub "\u0659", "ê" # U+0659

  # ـِي kasra followed by ي (U+064A) or ی (U+06CC); written without a "|"
  # separator so that a literal pipe after kasra is no longer matched.
  sub "\u0650" + any("\u064a\u06cc"), "ī"
  sub "\u0650", "i" # ِ kasra
  sub "\u06d2", "e" # ـے

  sub "\u0622", "ā" # آ
  sub "\u064e\u0627", "ā" # ـَا fatha followed by ا
  sub "\u0627", "ā" # ا
  sub boundary + "\u0627", "" # ا directly after a boundary is dropped

  sub "\u0648", "o" # و — suspect: "\u0648" is also mapped to "w" in the consonant rules of this block
  sub "\u064f", "u" # ُ damma
  sub "\u064f\u0648", "ū" # ـُو damma followed by و

  sub "\u064e\u06d2", "ai" # ـَے fatha followed by ے
  sub "\u064e\u0648", "au" # ـَو fatha followed by و
  sub "\u064e\u064a\u0651", "ayy" # ـَيّ fatha followed by ي carrying shadda

  sub "\u0670", "á" # ٰ (U+0670)
  sub "\u0649", "á" # ى (U+0649)

  # shadda
  # A letter followed by shadda (U+0651) is romanized by doubling the
  # romanization the same letter gets in the consonant rules of this block.
  # ث, ح, ش and ض previously disagreed with that table ("thth", "ẖẖ", an
  # undoubled "sh", and "ḏḏ"); they now double s̄, ḩ, sh and ẕ respectively.
  sub "\u0628\u0651", "bb" # ب
  sub "\u062a\u0651", "tt" # ت
  sub "\u062b\u0651", "s̄s̄" # ث
  sub "\u062c\u0651", "jj" # ج
  sub "\u062d\u0651", "ḩḩ" # ح
  sub "\u062e\u0651", "khkh" # خ
  sub "\u062f\u0651", "dd" # د
  sub "\u0630\u0651", "z̄z̄" # ذ
  sub "\u0631\u0651", "rr" # ر
  sub "\u0632\u0651", "zz" # ز
  sub "\u0633\u0651", "ss" # س
  sub "\u0634\u0651", "shsh" # ش
  sub "\u0635\u0651", "şş" # ص
  sub "\u0636\u0651", "ẕẕ" # ض
  sub "\u0637\u0651", "ţţ" # ط
  sub "\u0638\u0651", "z̧z̧" # ظ
  sub "\u063a\u0651", "ghgh" # غ
  sub "\u0641\u0651", "ff" # ف
  sub "\u0642\u0651", "qq" # ق
  sub "\u0643\u0651", "kk" # ك
  sub "\u0644\u0651", "ll" # ل
  sub "\u0645\u0651", "mm" # م
  sub "\u0646\u0651", "nn" # ن
  sub "\u0647\u0651", "hh" # ه
  sub "\u0648\u0651", "ww" # و
  # ي (U+064A) or ی (U+06CC) with shadda; character-set form without the
  # former "|" separator, which also matched a literal pipe.
  sub any("\u064a\u06cc") + "\u0651", "yy"
  # NOTE(review): the same source sequence "\u064e\u064a" is also mapped to
  # "aī" further down in this block; confirm which of the two is meant to win.
  sub "\u064e\u064a", "yy" # ـَي

  # NOTE 1
  # Metadata note 1: a word-final zīr (kasra) marks the Persian izafat and is
  # romanized "-e" rather than "i".
  sub "\u0650" + boundary, "-e" # ِ kasra directly before a boundary
  sub "\u0674", "-e" # ٴ (U+0674)
  sub "\u0654", "-e" # ٔ (U+0654)

  # Sequences of a short vowel plus ي / ی / ى / و, with or without sukun or shadda.
  sub "\u0650\u064a\u0651\u064e", "īy" # ـِيَّ kasra + ي + shadda + fatha
  sub "\u0650\u064a", "iy", after: any("\u064e\u064f") # ـِي kasra + ي when followed by fatha or damma
  sub "\u064e\u0649", "ay" # ـَى fatha followed by ى (U+0649)
  sub "\u064e\u0648\u0652", "aw" # ـَوْ fatha + و + sukun
  sub "\u064e\u064a\u0652", "ay" # ـَيْ fatha + ي + sukun
  sub "\u0650\u06cc\u0651\u064e", "īy" # ـِیَّ same as the ي rule above, for ی (U+06CC)
  sub "\u064e\u064a", "aī" # ـَي — NOTE(review): "\u064e\u064a" is also mapped to "yy" in the shadda rules of this block
  sub "\u064e\u06cc", "aī" # ـَی fatha + ی (U+06CC)
  # - '-ye'

  # ta' marboota
  # ة is "at" by default and "ah" at the end of the line.
  # The remaining rules also give "ah" when ة is preceded by a boundary, ال
  # and then a fixed number of characters from U+0600..U+06FF. There is one
  # rule per length, from 2 up to 13 characters, so only words of those
  # lengths are covered.
  # NOTE(review): this reading assumes `before:` is the preceding-context
  # condition — confirm against the Interscript documentation.
  sub "\u0629", "at" # ة in the middle of the sentence
  sub "\u0629" + line_end, "ah"
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")
  sub "\u0629", "ah", before: boundary + "\u0627\u0644" + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff") + any("\u0600".."\u06ff")

  # Hamza and its carriers are romanized as ’; ؤ directly before a boundary
  # becomes ’o.
  sub "\u0621", "’" # ء
  sub "\u0624", "’" # ؤ
  sub "\u0624" + boundary, "’o" # ؤ directly before a boundary
  sub "\u0626", "’" # ئ

  # Alef with hamza above/below produces no output of its own; the vowel sign
  # that follows it is romanized by the vowel rules.
  sub "\u0623", "" # أ
  sub "\u0625", "" # إ
  # NOTE(review): the original comment here read "See note B", but this
  # file's metadata has no note B — confirm the intended reference.
  sub boundary + "\u0627\u0644", "al " # ال after a boundary is written as a separate word
  # '\uFE8E' : ''  # ﺎ

  # Sun letters
  # After a boundary, ال followed by one of these letters (with or without
  # shadda) is written with the "l" replaced by the romanization of that
  # letter, followed by a space and the letter itself.
  sub boundary + "\u0627\u0644\u062a" + maybe("\u0651"), "at t" # الت
  sub boundary + "\u0627\u0644\u062b" + maybe("\u0651"), "as̄ s̄" # الث
  sub boundary + "\u0627\u0644\u062f" + maybe("\u0651"), "ad d" # الد
  sub boundary + "\u0627\u0644\u0630" + maybe("\u0651"), "az̄ z̄" # الذ
  sub boundary + "\u0627\u0644\u0631" + maybe("\u0651"), "ar r" # الر
  sub boundary + "\u0627\u0644\u0632" + maybe("\u0651"), "az z" # الز
  sub boundary + "\u0627\u0644\u0633" + maybe("\u0651"), "as s" # الس
  sub boundary + "\u0627\u0644\u0634" + maybe("\u0651"), "ash sh" # الش
  sub boundary + "\u0627\u0644\u0635" + maybe("\u0651"), "aş ş" # الص
  sub boundary + "\u0627\u0644\u0636" + maybe("\u0651"), "aẕ ẕ" # الض
  sub boundary + "\u0627\u0644\u0637" + maybe("\u0651"), "aţ ţ" # الط
  sub boundary + "\u0627\u0644\u0638" + maybe("\u0651"), "az̧ z̧" # الظ
  sub boundary + "\u0627\u0644\u0644" + maybe("\u0651"), "al l" # الل
  sub boundary + "\u0627\u0644\u0646" + maybe("\u0651"), "an n" # الن

  # consonant characters
  # Single-letter fallbacks: one romanization per consonant letter.

  sub "\u0628", "b" # ب
  sub "\u067E", "p" # پ
  sub "\u062a", "t" # ت
  sub "\u0679", "ṭ" # ٹ
  sub "\u062B", "s̄" # ث
  sub "\u062c", "j" # ج
  sub "\u0686", "ch" # چ
  sub "\u062d", "ḩ" # ح
  sub "\u062e", "kh" # خ
  sub "\u062f", "d" # د
  sub "\u0688", "ḍ" # ڈ
  sub "\u0630", "z̄" # ذ
  sub "\u0631", "r" # ر
  sub "\u0691", "ṛ" # ڑ
  sub "\u0632", "z" # ز
  sub "\u0698", "zh" # ژ
  sub "\u0633", "s" # س
  sub "\u0634", "sh" # ش
  sub "\u0635", "ş" # ص
  sub "\u0636", "ẕ" # ض
  sub "\u0637", "ţ" # ط
  sub "\u0638", "z̧" # ظ
  sub "\u0639", "‘" # ع
  sub "\u063a", "gh" # غ
  sub "\u0641", "f" # ف
  sub "\u0642", "q" # ق
  sub "\u0643", "k" # ك (U+0643)
  sub "\u06A9", "k" # ک (U+06A9)
  sub "\u06AF", "g" # گ
  sub "\u0644", "l" # ل
  sub "\u0645", "m" # م
  sub "\u0646", "n" # ن
  sub "\u06BA", "ñ" # ں (U+06BA); the earlier comment showed ڼ, which is a different code point
  sub any("\u0647\u06c1\u06be"), "h" # ه (U+0647), ہ (U+06C1), ھ (U+06BE)
  sub "\u0648", "w" # و — NOTE(review): "\u0648" is also mapped to "o" in the vowel rules of this block
  sub any("\u064a\u06cc"), "y" # ي (U+064A), ی (U+06CC)
  # '\u0649' : 'y'  # ى
  sub "\u06D0", "ē" # ې
  sub "\u06CD", "êy" # ۍ
}

# POSTRULES
# Capitalize the first character of each word: any character in
# U+0061..U+FFFF that comes right after a boundary is upper-cased, except when
# what precedes it is a boundary plus one of ‘ ’ ' or -.
# NOTE(review): this reading assumes `before:` / `not_before:` are
# preceding-context conditions — confirm against the Interscript documentation.
sub any("\u0061".."\uFFFF"), upcase, before: boundary, not_before: boundary + any("‘’'-")
# don't capitalize the definite article in the middle of a sentence: the
# capitalization step above also upper-cased the article, so each assimilated
# form (and plain "Al") that follows a space is put back in lower case.
sub " At T", " at T" # الت
sub " As̄ S̄", " as̄ S̄" # الث
sub " Ad D", " ad D" # الد
sub " Az̄ Z̄", " az̄ Z̄" # الذ
sub " Ar R", " ar R" # الر
sub " Az Z", " az Z" # الز
sub " As S", " as S" # الس
sub " Ash Sh", " ash Sh" # الش
sub " Aş Ş", " aş Ş" # الص
sub " Aẕ Ẕ", " aẕ Ẕ" # الض
sub " Aţ Ţ", " aţ Ţ" # الط
sub " Az̧ Z̧", " az̧ Z̧" # الظ
sub " Al L", " al L" # الل
sub " An N", " an N" # الن
sub " Al ", " al " # ال

# presumably Unicode-composes the result (base letter + combining mark into
# the precomposed form) — TODO confirm against the Interscript documentation
compose

}