001/*
002 * Copyright 2011 Atteo.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014package org.atteo.evo.inflector;
015
016/**
017 * Transforms English words from singular to plural form.
018 * <p>
019 * Examples:
020 * <pre>
021 *    English.plural("word") = "words";
022 *
023 *    English.plural("cat", 1) = "cat";
024 *    English.plural("cat", 2) = "cats";
025 * </pre>
026 * </p>
027 * <p>
028 * Based on <a href="http://www.csse.monash.edu.au/~damian/papers/HTML/Plurals.html">
029 * An Algorithmic Approach to English Pluralization</a> by Damian Conway.
030 * </p>
031 */
032public class English extends TwoFormInflector {
033    public static enum MODE {
034        ENGLISH_ANGLICIZED, ENGLISH_CLASSICAL
035    }
036
037    private static final String[] CATEGORY_EX_ICES = { "codex", "murex",
038            "silex", };
039
040    private static final String[] CATEGORY_IX_ICES = { "radix", "helix", };
041
042    private static final String[] CATEGORY_UM_A = { "bacterium",
043            "agendum", "desideratum", "erratum", "stratum", "datum", "ovum",
044            "extremum", "candelabrum", };
045
046    // Always us -> i
047    private static final String[] CATEGORY_US_I = { "alumnus", "alveolus",
048            "bacillus", "bronchus", "locus", "nucleus", "stimulus", "meniscus",
049            "thesaurus", };
050
051    private static final String[] CATEGORY_ON_A = { "criterion",
052            "perihelion", "aphelion", "phenomenon", "prolegomenon", "noumenon",
053            "organon", "asyndeton", "hyperbaton", };
054
055    private static final String[] CATEGORY_A_AE = { "alumna", "alga",
056            "vertebra", "persona" };
057
058    // Always o -> os
059    private static final String[] CATEGORY_O_OS = { "albino",
060            "archipelago", "armadillo", "commando", "crescendo", "fiasco",
061            "ditto", "dynamo", "embryo", "ghetto", "guano", "inferno", "jumbo",
062            "lumbago", "magneto", "manifesto", "medico", "octavo", "photo",
063            "pro", "quarto", "canto", "lingo", "generalissimo", "stylo",
064            "rhino", "casino", "auto", "macro", "zero", "todo"
065    };
066
067    // Classical o -> i  (normally -> os)
068    private static final String[] CATEGORY_O_I = {
069            "solo", "soprano", "basso", "alto", "contralto", "tempo", "piano",
070            "virtuoso", };
071
072    private static final String[] CATEGORY_EN_INA = {
073            "stamen", "foramen", "lumen"
074    };
075
076    // -a to -as (anglicized) or -ata (classical)
077    private static final String[] CATEGORY_A_ATA = {
078            "anathema", "enema", "oedema", "bema", "enigma", "sarcoma",
079            "carcinoma", "gumma", "schema", "charisma", "lemma", "soma",
080            "diploma", "lymphoma", "stigma", "dogma", "magma", "stoma",
081            "drama", "melisma", "trauma", "edema", "miasma"
082    };
083
084    private static final String[] CATEGORY_IS_IDES = {
085            "iris", "clitoris"
086    };
087
088    // -us to -uses (anglicized) or -us (classical)
089    private static final String[] CATEGORY_US_US = {
090            "apparatus", "impetus", "prospectus", "cantus", "nexus", "sinus", "coitus",
091            "plexus", "status", "hiatus"
092    };
093
094    private static final String[] CATEGORY_NONE_I = {
095        "afreet", "afrit", "efreet"
096    };
097
098    private static final String[] CATEGORY_NONE_IM = {
099        "cherub", "goy", "seraph"
100    };
101
102    private static final String[] CATEGORY_EX_EXES = {
103        "apex", "latex", "vertex", "cortex", "pontifex", "vortex", "index", "simplex"
104    };
105
106    private static final String[] CATEGORY_IX_IXES = {
107        "appendix"
108    };
109
110    private static final String[] CATEGORY_S_ES = {
111        "acropolis", "chaos", "lens", "aegis",
112        "cosmos", "mantis", "alias", "dais", "marquis", "asbestos",
113        "digitalis", "metropolis", "atlas", "epidermis", "pathos",
114        "bathos", "ethos", "pelvis", "bias", "gas", "polis", "caddis",
115        "glottis", "rhinoceros", "cannabis", "glottis", "sassafras",
116        "canvas", "ibis", "trellis"
117    };
118
119    private static final String[] CATEGORY_MAN_MANS = {
120        "human", "Alabaman", "Bahaman", "Burman", "German", "Hiroshiman", "Liman", "Nakayaman", "Oklahoman",
121        "Panaman", "Selman", "Sonaman", "Tacoman", "Yakiman", "Yokohaman", "Yuman"
122    };
123
124    private static English inflector = new English();
125
126
127    public English() {
128        this(MODE.ENGLISH_ANGLICIZED);
129    }
130
131    public English(MODE mode) {
132
133        uncountable(new String[] {
134            // 2. Handle words that do not inflect in the plural (such as fish, travois, chassis, nationalities ending
135            // endings
136            "fish", "ois", "sheep", "deer", "pox", "itis",
137
138            // words
139            "bison", "flounder", "pliers", "bream",
140            "gallows", "proceedings", "breeches", "graffiti", "rabies",
141            "britches", "headquarters", "salmon", "carp", "herpes",
142            "scissors", "chassis", "high-jinks", "sea-bass", "clippers",
143            "homework", "series", "cod", "innings", "shears",
144            "contretemps", "jackanapes", "species", "corps", "mackerel",
145            "swine", "debris", "measles", "trout", "diabetes", "mews",
146            "tuna", "djinn", "mumps", "whiting", "eland", "news",
147            "wildebeest", "elk", "pincers", "sugar" });
148
149        // 4. Handle standard irregular plurals (mongooses, oxen, etc.)
150
151        irregular(new String[][] {
152                { "child", "children" }, // classical
153                { "ephemeris", "ephemerides" }, // classical
154                { "mongoose", "mongoose" }, // anglicized
155                { "mythos", "mythoi" }, // classical
156                // TODO: handle entire word correctly
157                //{ "ox", "oxen" }, // classical
158                { "soliloquy", "soliloquies" }, // anglicized
159                { "trilby", "trilbys" }, // anglicized
160                { "genus", "genera" }, // classical
161                { "quiz", "quizzes" },
162        });
163
164        if (mode == MODE.ENGLISH_ANGLICIZED) {
165            // Anglicized plural
166            irregular(new String[][] {
167                    { "beef", "beefs" },
168                    { "brother", "brothers" },
169                    { "cow", "cows" },
170                    { "genie", "genies" },
171                    { "money", "moneys" },
172                    { "octopus", "octopuses" },
173                    { "opus", "opuses" },
174                });
175        } else if (mode == MODE.ENGLISH_CLASSICAL) {
176            // Classical plural
177            irregular(new String[][] { { "beef", "beeves"},
178                    { "brother", "brethren" },
179                    { "cow", "kine" }, { "genie", "genii"},
180                    { "money", "monies" },
181                    { "octopus", "octopodes" },
182                    { "opus", "opera" },
183            });
184        }
185
186        categoryRule(CATEGORY_MAN_MANS, "", "s");
187
188        // questionable
189        /*
190         rule(new String[][] {
191                { "(ness)$", "$1" },
192                { "(ality)$", "$1" }
193                { "(icity)$", "$1" },
194                { "(ivity)$", "$1" },
195        });
196         */
197        // 5. Handle irregular inflections for common suffixes
198        rule(new String[][] {
199                { "man$", "men" },
200                { "([lm])ouse$", "$1ice" },
201                { "tooth$", "teeth" },
202                { "goose$", "geese" },
203                { "foot$", "feet" },
204                { "zoon$", "zoa" },
205                { "([csx])is$", "$1es" },
206        });
207
208        // 6. Handle fully assimilated classical inflections
209        categoryRule(CATEGORY_EX_ICES, "ex", "ices");
210        categoryRule(CATEGORY_IX_ICES, "ix", "ices");
211        categoryRule(CATEGORY_UM_A, "um", "a");
212        categoryRule(CATEGORY_ON_A, "on", "a");
213        categoryRule(CATEGORY_A_AE, "a", "ae");
214
215        // 7. Handle classical variants of modern inflections
216        if (mode == MODE.ENGLISH_CLASSICAL) {
217            rule(new String[][]{
218                    { "trix$", "trices" },
219                    { "eau$", "eaux" },
220                    { "ieu$", "ieux" },
221                    { "(..[iay])nx$", "$1nges" },
222            });
223            categoryRule(CATEGORY_EN_INA, "en", "ina");
224            categoryRule(CATEGORY_A_ATA, "a", "ata");
225            categoryRule(CATEGORY_IS_IDES, "is", "ides");
226            categoryRule(CATEGORY_US_US, "", "");
227            categoryRule(CATEGORY_O_I, "o", "i");
228            categoryRule(CATEGORY_NONE_I, "", "i");
229            categoryRule(CATEGORY_NONE_IM, "", "im");
230            categoryRule(CATEGORY_EX_EXES, "ex", "ices");
231            categoryRule(CATEGORY_IX_IXES, "ix", "ices");
232        }
233
234        categoryRule(CATEGORY_US_I, "us", "i");
235
236        rule("([cs]h|[zx])$", "$1es");
237        categoryRule(CATEGORY_S_ES, "", "es");
238        categoryRule(CATEGORY_IS_IDES, "", "es");
239        categoryRule(CATEGORY_US_US, "", "es");
240        rule("(us)$", "$1es");
241        categoryRule(CATEGORY_A_ATA, "", "s");
242
243        // The suffixes -ch, -sh, and -ss all take -es in the plural (churches,
244        // classes, etc)...
245        rule(new String[][] { { "([cs])h$", "$1hes" }, { "ss$", "sses" } });
246
247        // Certain words ending in -f or -fe take -ves in the plural (lives,
248        // wolves, etc)...
249        rule(new String[][] {
250                { "([aeo]l)f$", "$1ves" },
251                { "([^d]ea)f$", "$1ves" },
252                { "(ar)f$", "$1ves" },
253                { "([nlw]i)fe$", "$1ves" }
254        });
255
256        // Words ending in -y take -ys
257        rule(new String[][] { { "([aeiou]y)$", "$1s" }, { "y$", "ies" }, });
258
259        // Some words ending in -o take -os (including does preceded by a vowel)
260        categoryRule(CATEGORY_O_I, "o", "os");
261        categoryRule(CATEGORY_O_OS, "o", "os");
262        rule("([aeiou]o)$", "$1s");
263        // The rest take -oes
264        rule("(o)$", "$1es");
265
266        rule("(ul)um$", "$1a");
267
268        categoryRule(CATEGORY_A_ATA, "", "es");
269
270        rule("(s)$", "$1es");
271
272        // Return empty string for empty string input
273        rule("^$", "");
274        // Otherwise, assume that the plural just adds -s
275        rule("$", "s");
276    }
277
278    /**
279     * Returns plural form of the given word.
280     *
281     * @param word word in singular form
282     * @return plural form of the word
283     */
284    @Override
285    public String getPlural(String word) {
286        return super.getPlural(word);
287    }
288
289    /**
290     * Returns singular or plural form of the word based on count.
291     *
292     * @param word word in singular form
293     * @param count word count
294     * @return form of the word correct for given count
295     */
296    public String getPlural(String word, int count) {
297        if (count == 1) {
298            return word;
299        }
300        return getPlural(word);
301    }
302
303    /**
304     * Returns plural form of the given word.
305     * <p>
306     * For instance:
307     * <pre>
308     * {@code
309     * English.plural("cat") == "cats";
310     * }
311     * </pre>
312     * </p>
313     * @param word word in singular form
314     * @return plural form of given word
315     */
316    public static String plural(String word) {
317        return inflector.getPlural(word);
318    }
319
320    /**
321     * Returns singular or plural form of the word based on count.
322     * <p>
323     * For instance:
324     * <pre>
325     * {@code
326     * English.plural("cat", 1) == "cat";
327     * English.plural("cat", 2) == "cats";
328     * }
329     * </pre>
330     * </p>
331     * @param word word in singular form
332     * @param count word count
333     * @return form of the word correct for given count
334     */
335    public static String plural(String word, int count) {
336        return inflector.getPlural(word, count);
337    }
338
339    public static void setMode(MODE mode) {
340        English newInflector = new English(mode);
341        inflector = newInflector;
342    }
343}