DeveelDB  20151217
complete SQL database system, primarily developed for .NET/Mono frameworks
Metaphone.cs
Go to the documentation of this file.
1 //
2 // Copyright 2010-2015 Deveel
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 using System;
18 using System.Text;
19 
20 namespace Deveel.Data.Text {
/// <summary>
/// Encodes a word into a short phonetic key (at most four characters)
/// by applying Metaphone-style letter rules to its upper-cased form.
/// </summary>
public sealed class Metaphone {
    #region ctor
    /// <summary>
    /// Creates a new encoder. The encoder holds no per-instance state.
    /// </summary>
    public Metaphone() {
    }
    #endregion

    #region Fields
    // Letters treated as vowels by the rules below.
    private const string Vowels = "AEIOU";
    // Letters that soften a preceding C or G (CE -> S, GI -> J, ...).
    private const string FrontVowels = "EIY";
    // Letters after which a following H produces no code of its own.
    private const string Varson = "CSPTG";
    // Maximum number of characters in a computed key.
    private const int MaxCodeLength = 4;
    #endregion

    #region Private Methods
    /// <summary>
    /// Tells whether <paramref name="n"/> is the index of the last
    /// character of a word of <paramref name="wdsz"/> characters.
    /// </summary>
    private static bool IsLastChar(int wdsz, int n) {
        return n + 1 == wdsz;
    }

    /// <summary>
    /// Tells whether <paramref name="test"/> occurs in <paramref name="sb"/>
    /// starting exactly at <paramref name="index"/> (ordinal comparison).
    /// Returns false when the region would fall outside the buffer.
    /// </summary>
    private static bool RegionMatch(StringBuilder sb, int index, String test) {
        if (index < 0 || index + test.Length > sb.Length) {
            return false;
        }

        // Compare in place rather than allocating a substring per probe.
        for (int i = 0; i < test.Length; i++) {
            if (sb[index + i] != test[i]) {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Tells whether the character at <paramref name="index"/> is one of A, E, I, O, U.
    /// The caller must pass an index inside the buffer.
    /// </summary>
    private static bool IsVowel(StringBuilder sb, int index) {
        return Vowels.IndexOf(sb[index]) >= 0;
    }

    /// <summary>
    /// Tells whether the character just before <paramref name="index"/> equals
    /// <paramref name="c"/>; false when there is no previous character.
    /// </summary>
    private static bool IsPreviousChar(StringBuilder sb, int index, char c) {
        return index > 0 && index < sb.Length && sb[index - 1] == c;
    }

    /// <summary>
    /// Tells whether the character just after <paramref name="index"/> equals
    /// <paramref name="c"/>; false when there is no next character.
    /// </summary>
    private static bool IsNextChar(StringBuilder sb, int index, char c) {
        return index >= 0 && index < sb.Length - 1 && sb[index + 1] == c;
    }
    #endregion

    #region Public Methods
    /// <summary>
    /// Computes the phonetic key of the given word.
    /// </summary>
    /// <param name="s">The word to encode; may be null or empty.</param>
    /// <returns>
    /// An empty string for null or empty input, the upper-cased character for
    /// a one-character input, otherwise a key of at most four characters.
    /// Characters that are not handled by any rule contribute nothing.
    /// </returns>
    public string Compute(string s) {
        if (string.IsNullOrEmpty(s))
            return "";

        // Upper-casing must not depend on the current culture: under a
        // Turkish culture "i" would become U+0130, which no rule matches.
        // single character is itself
        if (s.Length == 1)
            return s.ToUpperInvariant();

        char[] inwd = s.ToUpperInvariant().ToCharArray();

        StringBuilder local = new StringBuilder(40); // working copy of the word
        StringBuilder code = new StringBuilder(10);  // output key

        // handle exceptions on the initial two characters
        switch (inwd[0]) {
            case 'K':
            case 'G':
            case 'P': /* KN, GN, PN -> N */
                if (inwd[1] == 'N') {
                    local.Append(inwd, 1, inwd.Length - 1);
                } else {
                    local.Append(inwd);
                }
                break;
            case 'A': /* AE -> E */
                if (inwd[1] == 'E') {
                    local.Append(inwd, 1, inwd.Length - 1);
                } else {
                    local.Append(inwd);
                }
                break;
            case 'W': /* WR -> R, WH -> W */
                if (inwd[1] == 'R') {
                    local.Append(inwd, 1, inwd.Length - 1);
                    break;
                }
                if (inwd[1] == 'H') {
                    local.Append(inwd, 1, inwd.Length - 1);
                    local[0] = 'W'; // overwrite the H that is now first
                } else {
                    local.Append(inwd);
                }
                break;
            case 'X': /* initial X becomes S */
                inwd[0] = 'S';
                local.Append(inwd);
                break;
            default:
                local.Append(inwd);
                break;
        } // now local has the working string with initials fixed

        int wdsz = local.Length;
        int n = 0;

        while ((code.Length < MaxCodeLength) && (n < wdsz)) {
            char symb = local[n];
            // skip doubled letters, except C
            if ((symb != 'C') && IsPreviousChar(local, n, symb)) {
                n++;
            } else { // not a duplicate
                switch (symb) {
                    case 'A': case 'E': case 'I': case 'O': case 'U':
                        if (n == 0) {
                            code.Append(symb);
                        }
                        break; // a vowel is only kept as the leading character
                    case 'B':
                        if (IsPreviousChar(local, n, 'M') &&
                            IsLastChar(wdsz, n)) { // B is silent if the word ends in MB
                            break;
                        }
                        code.Append(symb);
                        break;
                    case 'C': // lots of C special cases
                        /* discard if SCI, SCE or SCY */
                        if (IsPreviousChar(local, n, 'S') &&
                            !IsLastChar(wdsz, n) &&
                            (FrontVowels.IndexOf(local[n + 1]) >= 0)) {
                            break;
                        }
                        if (RegionMatch(local, n, "CIA")) { // "CIA" -> X
                            code.Append('X');
                            break;
                        }
                        if (!IsLastChar(wdsz, n) &&
                            (FrontVowels.IndexOf(local[n + 1]) >= 0)) {
                            code.Append('S');
                            break; // CI, CE, CY -> S
                        }
                        if (IsPreviousChar(local, n, 'S') &&
                            IsNextChar(local, n, 'H')) { // SCH -> SK
                            code.Append('K');
                            break;
                        }
                        if (IsNextChar(local, n, 'H')) { // detect CH
                            // Initial CH followed by a vowel -> K, any other CH -> X.
                            // NOTE(review): earlier comments described the opposite
                            // (CH + consonant -> K); the behaviour is kept as it was.
                            if ((n == 0) &&
                                (wdsz >= 3) &&
                                IsVowel(local, 2)) {
                                code.Append('K');
                            } else {
                                code.Append('X');
                            }
                        } else {
                            code.Append('K');
                        }
                        break;
                    case 'D':
                        // The first two tests guarantee that n + 2 is inside the word.
                        if (!IsLastChar(wdsz, n + 1) &&
                            IsNextChar(local, n, 'G') &&
                            (FrontVowels.IndexOf(local[n + 2]) >= 0)) { // DGE DGI DGY -> J
                            code.Append('J');
                            n += 2; // also consume the G and the front vowel
                        } else {
                            code.Append('T');
                        }
                        break;
                    case 'G': // GH silent at end or before consonant
                        if (IsLastChar(wdsz, n + 1) &&
                            IsNextChar(local, n, 'H')) {
                            break;
                        }
                        if (!IsLastChar(wdsz, n + 1) &&
                            IsNextChar(local, n, 'H') &&
                            !IsVowel(local, n + 2)) {
                            break;
                        }
                        if ((n > 0) &&
                            (RegionMatch(local, n, "GN") ||
                             RegionMatch(local, n, "GNED"))) {
                            break; // silent G
                        }
                        // Doubled letters never reach this switch, so a G is never
                        // preceded by another G here and needs no "hard G" check.
                        if (!IsLastChar(wdsz, n) &&
                            (FrontVowels.IndexOf(local[n + 1]) >= 0)) {
                            code.Append('J');
                        } else {
                            code.Append('K');
                        }
                        break;
                    case 'H':
                        if (IsLastChar(wdsz, n)) {
                            break; // terminal H
                        }
                        if ((n > 0) &&
                            (Varson.IndexOf(local[n - 1]) >= 0)) {
                            break; // H after C, S, P, T or G was handled with that letter
                        }
                        if (IsVowel(local, n + 1)) {
                            code.Append('H'); // H followed by a vowel
                        }
                        break;
                    case 'F':
                    case 'J':
                    case 'L':
                    case 'M':
                    case 'N':
                    case 'R':
                        code.Append(symb);
                        break;
                    case 'K':
                        // K is silent after C (CK), otherwise kept
                        if (n > 0) { // not initial
                            if (!IsPreviousChar(local, n, 'C')) {
                                code.Append(symb);
                            }
                        } else {
                            code.Append(symb); // initial K
                        }
                        break;
                    case 'P':
                        if (IsNextChar(local, n, 'H')) {
                            // PH -> F
                            code.Append('F');
                        } else {
                            code.Append(symb);
                        }
                        break;
                    case 'Q':
                        code.Append('K');
                        break;
                    case 'S':
                        if (RegionMatch(local, n, "SH") ||
                            RegionMatch(local, n, "SIO") ||
                            RegionMatch(local, n, "SIA")) {
                            code.Append('X');
                        } else {
                            code.Append('S');
                        }
                        break;
                    case 'T':
                        if (RegionMatch(local, n, "TIA") ||
                            RegionMatch(local, n, "TIO")) {
                            code.Append('X');
                            break;
                        }
                        if (RegionMatch(local, n, "TCH")) {
                            // silent if in "TCH"
                            break;
                        }
                        // substitute numeral 0 for TH (resembles theta after all)
                        if (RegionMatch(local, n, "TH")) {
                            code.Append('0');
                        } else {
                            code.Append('T');
                        }
                        break;
                    case 'V':
                        code.Append('F');
                        break;
                    case 'W': case 'Y': // silent if not followed by a vowel
                        if (!IsLastChar(wdsz, n) &&
                            IsVowel(local, n + 1)) {
                            code.Append(symb);
                        }
                        break;
                    case 'X':
                        // X yields two characters and may overshoot the limit
                        code.Append('K');
                        code.Append('S');
                        break;
                    case 'Z':
                        code.Append('S');
                        break;
                } // end switch
                n++;
            } // end else (not a duplicate)

            // trim the overshoot a two-character code can cause
            if (code.Length > MaxCodeLength) {
                code.Length = MaxCodeLength;
            }
        }
        return code.ToString();
    }
    #endregion
}
309 }
bool RegionMatch(StringBuilder sb, int index, String test)
Definition: Metaphone.cs:39
bool IsVowel(StringBuilder sb, int index)
Definition: Metaphone.cs:49
bool IsPreviousChar(StringBuilder sb, int index, char c)
Definition: Metaphone.cs:53
bool IsNextChar(StringBuilder sb, int index, char c)
Definition: Metaphone.cs:61
bool IsLastChar(int wdsz, int n)
Definition: Metaphone.cs:35
string Compute(string s)
Definition: Metaphone.cs:71