;============================================================================================
;============================================================================================
;ENGLISH NORMALIZATION GRAMMAR
;Documentation available at http://www.unlweb.net/wiki/normalization
;
;Each rule has the form  (left-hand nodes):=(right-hand nodes);
;Strings between "/ /" are regular expressions; %x are node indexes.
;NOTE(review): the ordering below assumes that rules are tried in the order in which
;they appear, so more specific rules are placed before the generic ones that would
;otherwise consume part of their input - confirm against the engine in use.
;============================================================================================
;============================================================================================

;Collapse two consecutive blanks into one
(" ")(" "):=(" ");

;============================================================================================
;I. CONTRACTIONS
;Only those that are not ambiguous
;============================================================================================
("ain't"):=("are not");
("I'm"):=("I am");
;"how'd'y" must precede "how'd", which is a prefix of it
("how'd'y"):=("how do you");
("how'd"):=("how did");
("ma'am"):=("madam");
;Irregular negatives must precede the generic "n't" rule,
;which would otherwise yield "sha not", "ca not" and "wo not"
("shan't"):=("shall not");
("can't"):=("can not");
("won't"):=("will not");
;"y'all'd've" must precede "y'all", which is a prefix of it
("y'all'd've"):=("you all would have");
("y'all"):=("you all");
;I'll > I will
("'ll"):=(" will");
;aren't > are not
("n't"):=(" not");
;Any remaining "'t" (kept for backward compatibility)
("'t"):=(" not");
;you're > you are
("'re"):=(" are");
;you've > you have
("'ve"):=(" have");

;============================================================================================
;II. ABBREVIATIONS
;Only those involving a period, in order to avoid problems with segmentation
;The abbreviation must be at the beginning of the sentence (SHEAD) or preceded by a blank
;============================================================================================
({SHEAD|" "})("A.B. "):=()("AB ");
({SHEAD|" "})("A.D. "):=()("AD ");
({SHEAD|" "})("A.M. "):=()("AM ");
({SHEAD|" "})("abbr. "):=()("abbreviation ");
({SHEAD|" "})("Acad. "):=()("Academy ");
({SHEAD|" "})("alt. "):=()("altitude ");
({SHEAD|" "})("apt. "):=()("apartment ");
({SHEAD|" "})("Assn. "):=()("Association ");
({SHEAD|" "})("at. no. "):=()("atomic number ");
({SHEAD|" "})("at. wt. "):=()("atomic weight ");
({SHEAD|" "})("Aug. "):=()("August ");
({SHEAD|" "})("Ave. "):=()("Avenue ");
({SHEAD|" "})("b. "):=()("born in ");
({SHEAD|" "})("B.A. "):=()("BA ");
({SHEAD|" "})("B.C. "):=()("BC ");
({SHEAD|" "})("b.p. "):=()("boiling point ");
({SHEAD|" "})("B.S. "):=()("BS ");
({SHEAD|" "})("Blvd. "):=()("Boulevard ");
({SHEAD|" "})("c. "):=()("circa ");
({SHEAD|" "})("ca. "):=()("circa ");
({SHEAD|" "})("Capt. "):=()("Captain ");
({SHEAD|" "})("cent. "):=()("century ");
({SHEAD|" "})("co. "):=()("county ");
({SHEAD|" "})("Col. "):=()("Colonel ");
({SHEAD|" "})("Comdr. "):=()("Commander ");
({SHEAD|" "})("Corp. "):=()("Corporation ");
({SHEAD|" "})("Cpl. "):=()("Corporal ");
({SHEAD|" "})("Ct. "):=()("Court ");
({SHEAD|" "})("ctr. "):=()("center ");
({SHEAD|" "})("d. "):=()("died in ");
({SHEAD|" "})("D.C. "):=()("District of Columbia ");
({SHEAD|" "})("Dec. "):=()("December ");
({SHEAD|" "})("dept. "):=()("department ");
({SHEAD|" "})("dist. "):=()("district ");
({SHEAD|" "})("div. "):=()("division ");
({SHEAD|" "})("Dr. "):=()("doctor ");
({SHEAD|" "})("e.g. "):=()("for example ");
({SHEAD|" "})("ed. "):=()("edition ");
;NOTE(review): "est." may also stand for "estimated"; a single expansion must be chosen
({SHEAD|" "})("est. "):=()("established ");
({SHEAD|" "})("et al. "):=()("and others ");
({SHEAD|" "})("etc. "):=()("and so on ");
({SHEAD|" "})("Feb. "):=()("February ");
({SHEAD|" "})("fl. "):=()("flourished ");
({SHEAD|" "})("ft. "):=()("foot ");
({SHEAD|" "})("Ft. "):=()("Fort ");
({SHEAD|" "})("gal. "):=()("gallon ");
({SHEAD|" "})("Gen. "):=()("General ");
({SHEAD|" "})("Gov. "):=()("governor ");
({SHEAD|" "})("grad. "):=()("graduated at ");
({SHEAD|" "})("Hon. "):=()("the Honorable ");
({SHEAD|" "})("hwy. "):=()("highway ");
({SHEAD|" "})("i.e. "):=()("that is ");
({SHEAD|" "})("in. "):=()("inch ");
({SHEAD|" "})("inc. "):=()("incorporated ");
;NOTE(review): "Inst." may also stand for "Institution"; a single expansion must be chosen
({SHEAD|" "})("Inst. "):=()("Institute ");
({SHEAD|" "})("Jan. "):=()("January ");
({SHEAD|" "})("Jr. "):=()("Junior ");
({SHEAD|" "})("lat. "):=()("latitude ");
({SHEAD|" "})("Lib. "):=()("Library ");
({SHEAD|" "})("Lk. "):=()("Lake ");
({SHEAD|" "})("Ln. "):=()("Lane ");
({SHEAD|" "})("long. "):=()("longitude ");
({SHEAD|" "})("Lt. "):=()("Lieutenant ");
({SHEAD|" "})("Ltd. "):=()("Limited ");
({SHEAD|" "})("M.D. "):=()("Medical Doctor ");
({SHEAD|" "})("mo. "):=()("month ");
({SHEAD|" "})("Mr. "):=()("Mister ");
({SHEAD|" "})("Mrs. "):=()("Mistress ");
({SHEAD|" "})("Ms. "):=()("Miss ");
({SHEAD|" "})("Msgr. "):=()("Monsignor ");
({SHEAD|" "})("mt. "):=()("mountain ");
({SHEAD|" "})("mts. "):=()("mountains ");
({SHEAD|" "})("Mus. "):=()("Museum ");
({SHEAD|" "})("mus. "):=()("museum ");
({SHEAD|" "})("no. "):=()("number ");
({SHEAD|" "})("Nov. "):=()("November ");
({SHEAD|" "})("Oct. "):=()("October ");
({SHEAD|" "})("Op. "):=()("Opus ");
({SHEAD|" "})("oz. "):=()("ounce ");
({SHEAD|" "})("p. "):=()("page ");
({SHEAD|" "})("P.M. "):=()("PM ");
({SHEAD|" "})("pl. "):=()("plural ");
({SHEAD|" "})("pop. "):=()("population ");
({SHEAD|" "})("Prof. "):=()("Professor ");
({SHEAD|" "})("pseud. "):=()("pseudonym ");
({SHEAD|" "})("pt. "):=()("pint ");
({SHEAD|" "})("pub. "):=()("published ");
({SHEAD|" "})("qt. "):=()("quart ");
({SHEAD|" "})("R.N. "):=()("Registered Nurse ");
({SHEAD|" "})("Rd. "):=()("Road ");
({SHEAD|" "})("Rev. "):=()("the Reverend ");
({SHEAD|" "})("rev. "):=()("revised ");
({SHEAD|" "})("Sept. "):=()("September ");
({SHEAD|" "})("Ser. "):=()("Series ");
({SHEAD|" "})("Sgt. "):=()("Sergeant ");
({SHEAD|" "})("Sr. "):=()("Senior ");
({SHEAD|" "})("St. "):=()("Saint ");
;"St. " > "Street " has the same left-hand side as the rule above and could never apply;
;it is kept here only as a reminder that the abbreviation is ambiguous
;({SHEAD|" "})("St. "):=()("Street ");
({SHEAD|" "})("Sta. "):=()("Station ");
({SHEAD|" "})("ste. "):=()("suite ");
({SHEAD|" "})("Sun. "):=()("Sunday ");
({SHEAD|" "})("Ter. "):=()("Terrace ");
({SHEAD|" "})("Tpk. "):=()("Turnpike ");
({SHEAD|" "})("U.S. "):=()("United States ");
({SHEAD|" "})("uninc. "):=()("unincorporated ");
({SHEAD|" "})("Univ. "):=()("University ");
({SHEAD|" "})("U.S.A. "):=()("United States of America ");
({SHEAD|" "})("vol. "):=()("volume ");
({SHEAD|" "})("vs. "):=()("versus ");
({SHEAD|" "})("wt. "):=()("weight ");
;Initials: John H. Smith > John H Smith
;Placed after the specific abbreviations: applied earlier it would turn " A.D. " into " AD. "
;and " U.S.A. " into " US.A. ", so that the rules above could no longer match
(" ",%a)("/[A-Z]/",%b)(".",%c):=(%a)(%b);

;============================================================================================
;III. DATES
;Convert dates to the format dd/mm/yyyy
;The day is matched as a number, two-digit alternatives first:
;  1 to 31: (3[01]|[12][0-9]|0?[1-9])
;  1 to 30: (30|[12][0-9]|0?[1-9])
;  1 to 29: ([12][0-9]|0?[1-9])
;A character class such as [1-31] would match one single character only
;============================================================================================
;Months
("Jan "):=("January ");
("Feb "):=("February ");
("Mar "):=("March ");
("Apr "):=("April ");
("Jun "):=("June ");
("Jul "):=("July ");
("Aug "):=("August ");
("Sep "):=("September ");
("Oct "):=("October ");
("Nov "):=("November ");
("Dec "):=("December ");
;5(th) (of) October(,) 2004
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("January",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("01")("/")(%y);
("/([12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("February",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("02")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("March",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("03")("/")(%y);
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("April",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("04")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("May",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("05")("/")(%y);
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("June",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("06")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("July",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("07")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("August",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("08")("/")(%y);
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("September",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("09")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("October",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("10")("/")(%y);
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("November",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("11")("/")(%y);
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("December",%m)("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("12")("/")(%y);
;5(th) (of) October
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("January",%m):=(%d)("/")("01");
("/([12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("February",%m):=(%d)("/")("02");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("March",%m):=(%d)("/")("03");
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("April",%m):=(%d)("/")("04");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("May",%m):=(%d)("/")("05");
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("June",%m):=(%d)("/")("06");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("July",%m):=(%d)("/")("07");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("August",%m):=(%d)("/")("08");
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("September",%m):=(%d)("/")("09");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("October",%m):=(%d)("/")("10");
("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("November",%m):=(%d)("/")("11");
("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(\s|\sof\s)/")("December",%m):=(%d)("/")("12");
;October (the) 5(th), 2004
("January",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("01")("/")(%y);
("February",%m)("/(\s|\sthe\s)/")("/([12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("02")("/")(%y);
("March",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("03")("/")(%y);
("April",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("04")("/")(%y);
("May",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("05")("/")(%y);
("June",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("06")("/")(%y);
("July",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("07")("/")(%y);
("August",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("08")("/")(%y);
("September",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("09")("/")(%y);
("October",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("10")("/")(%y);
("November",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("11")("/")(%y);
("December",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/")("/(,)?/")(" ")("/\d{3,4}/",%y):=(%d)("/")("12")("/")(%y);
;October (the) 5(th)
("January",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("01");
("February",%m)("/(\s|\sthe\s)/")("/([12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("02");
("March",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("03");
("April",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("04");
("May",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("05");
("June",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("06");
("July",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("07");
("August",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("08");
("September",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("09");
("October",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("10");
("November",%m)("/(\s|\sthe\s)/")("/(30|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("11");
("December",%m)("/(\s|\sthe\s)/")("/(3[01]|[12][0-9]|0?[1-9])/",%d)("/(st|nd|rd|th)?/"):=(%d)("/")("12");

;============================================================================================
;IV. REORDERING
;Only immediate constituents
;The auxiliary (%b) and the subject pronoun (%d) at the beginning of a segment (STAIL)
;are swapped; alternatives inside a regular expression are separated by "|"
;============================================================================================
;Do I > I Do
(STAIL,%a)("/(Do|Have)/",%b)(" ",%c)("/(I|you|we|they)/",%d)(" ",%e):=(%a)(%d)(%c)(%b)(%e);
;Does he > he Does
(STAIL,%a)("/(Does|Has)/",%b)(" ",%c)("/(he|she|it)/",%d)(" ",%e):=(%a)(%d)(%c)(%b)(%e);
;Can he > he Can
;Modals and "Had" do not agree with the subject, so every subject pronoun is accepted
(STAIL,%a)("/(Can|Could|Had|May|Must|Should|Will|Would)/",%b)(" ",%c)("/(I|you|he|she|it|we|they)/",%d)(" ",%e):=(%a)(%d)(%c)(%b)(%e);

;============================================================================================
;V. SEGMENTATION
;The blank following a sentence-final punctuation mark is replaced by a node carrying STAIL
;============================================================================================
(".",%end)(" ",%blank):=(%end)(%new,+STAIL);
("?",%end)(" ",%blank):=(%end)(%new,+STAIL);
("!",%end)(" ",%blank):=(%end)(%new,+STAIL);