porterStemmer.m 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. function stem = porterStemmer(inString)
  2. % Applies the Porter Stemming algorithm as presented in the following
  3. % paper:
  4. % Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  5. % no. 3, pp 130-137
  6. % Original code modeled after the C version provided at:
  7. % http://www.tartarus.org/~martin/PorterStemmer/c.txt
  8. % The main part of the stemming algorithm starts here. b is an array of
  9. % characters, holding the word to be stemmed. The letters are in b[k0],
  10. % b[k0+1] ending at b[k]. In fact k0 = 1 in this demo program (since
  11. % matlab begins indexing by 1 instead of 0). k is readjusted downwards as
  12. % the stemming progresses. Zero termination is not in fact used in the
  13. % algorithm.
  14. % To call this function, use the string to be stemmed as the input
  15. % argument. This function returns the stemmed word as a string.
  16. % Lower-case string
  17. inString = lower(inString);
  18. global j;
  19. b = inString;
  20. k = length(b);
  21. k0 = 1;
  22. j = k;
  23. % With this if statement, strings of length 1 or 2 don't go through the
  24. % stemming process. Remove this conditional to match the published
  25. % algorithm.
  26. stem = b;
  27. if k > 2
  28. % Output displays per step are commented out.
  29. %disp(sprintf('Word to stem: %s', b));
  30. x = step1ab(b, k, k0);
  31. %disp(sprintf('Steps 1A and B yield: %s', x{1}));
  32. x = step1c(x{1}, x{2}, k0);
  33. %disp(sprintf('Step 1C yields: %s', x{1}));
  34. x = step2(x{1}, x{2}, k0);
  35. %disp(sprintf('Step 2 yields: %s', x{1}));
  36. x = step3(x{1}, x{2}, k0);
  37. %disp(sprintf('Step 3 yields: %s', x{1}));
  38. x = step4(x{1}, x{2}, k0);
  39. %disp(sprintf('Step 4 yields: %s', x{1}));
  40. x = step5(x{1}, x{2}, k0);
  41. %disp(sprintf('Step 5 yields: %s', x{1}));
  42. stem = x{1};
  43. end
  44. % cons(j) is TRUE <=> b[j] is a consonant.
  45. function c = cons(i, b, k0)
  46. c = true;
  47. switch(b(i))
  48. case {'a', 'e', 'i', 'o', 'u'}
  49. c = false;
  50. case 'y'
  51. if i == k0
  52. c = true;
  53. else
  54. c = ~cons(i - 1, b, k0);
  55. end
  56. end
  57. % mseq() measures the number of consonant sequences between k0 and j. If
  58. % c is a consonant sequence and v a vowel sequence, and <..> indicates
  59. % arbitrary presence,
  60. % <c><v> gives 0
  61. % <c>vc<v> gives 1
  62. % <c>vcvc<v> gives 2
  63. % <c>vcvcvc<v> gives 3
  64. % ....
  65. function n = measure(b, k0)
  66. global j;
  67. n = 0;
  68. i = k0;
  69. while true
  70. if i > j
  71. return
  72. end
  73. if ~cons(i, b, k0)
  74. break;
  75. end
  76. i = i + 1;
  77. end
  78. i = i + 1;
  79. while true
  80. while true
  81. if i > j
  82. return
  83. end
  84. if cons(i, b, k0)
  85. break;
  86. end
  87. i = i + 1;
  88. end
  89. i = i + 1;
  90. n = n + 1;
  91. while true
  92. if i > j
  93. return
  94. end
  95. if ~cons(i, b, k0)
  96. break;
  97. end
  98. i = i + 1;
  99. end
  100. i = i + 1;
  101. end
  102. % vowelinstem() is TRUE <=> k0,...j contains a vowel
  103. function vis = vowelinstem(b, k0)
  104. global j;
  105. for i = k0:j,
  106. if ~cons(i, b, k0)
  107. vis = true;
  108. return
  109. end
  110. end
  111. vis = false;
  112. %doublec(i) is TRUE <=> i,(i-1) contain a double consonant.
  113. function dc = doublec(i, b, k0)
  114. if i < k0+1
  115. dc = false;
  116. return
  117. end
  118. if b(i) ~= b(i-1)
  119. dc = false;
  120. return
  121. end
  122. dc = cons(i, b, k0);
  123. % cvc(j) is TRUE <=> j-2,j-1,j has the form consonant - vowel - consonant
  124. % and also if the second c is not w,x or y. this is used when trying to
  125. % restore an e at the end of a short word. e.g.
  126. %
  127. % cav(e), lov(e), hop(e), crim(e), but
  128. % snow, box, tray.
  129. function c1 = cvc(i, b, k0)
  130. if ((i < (k0+2)) || ~cons(i, b, k0) || cons(i-1, b, k0) || ~cons(i-2, b, k0))
  131. c1 = false;
  132. else
  133. if (b(i) == 'w' || b(i) == 'x' || b(i) == 'y')
  134. c1 = false;
  135. return
  136. end
  137. c1 = true;
  138. end
  139. % ends(s) is TRUE <=> k0,...k ends with the string s.
  140. function s = ends(str, b, k)
  141. global j;
  142. if (str(length(str)) ~= b(k))
  143. s = false;
  144. return
  145. end % tiny speed-up
  146. if (length(str) > k)
  147. s = false;
  148. return
  149. end
  150. if strcmp(b(k-length(str)+1:k), str)
  151. s = true;
  152. j = k - length(str);
  153. return
  154. else
  155. s = false;
  156. end
  157. % setto(s) sets (j+1),...k to the characters in the string s, readjusting
  158. % k accordingly.
  159. function so = setto(s, b, k)
  160. global j;
  161. for i = j+1:(j+length(s))
  162. b(i) = s(i-j);
  163. end
  164. if k > j+length(s)
  165. b((j+length(s)+1):k) = '';
  166. end
  167. k = length(b);
  168. so = {b, k};
  169. % rs(s) is used further down.
  170. % [Note: possible null/value for r if rs is called]
  171. function r = rs(str, b, k, k0)
  172. r = {b, k};
  173. if measure(b, k0) > 0
  174. r = setto(str, b, k);
  175. end
  176. % step1ab() gets rid of plurals and -ed or -ing. e.g.
  177. % caresses -> caress
  178. % ponies -> poni
  179. % ties -> ti
  180. % caress -> caress
  181. % cats -> cat
  182. % feed -> feed
  183. % agreed -> agree
  184. % disabled -> disable
  185. % matting -> mat
  186. % mating -> mate
  187. % meeting -> meet
  188. % milling -> mill
  189. % messing -> mess
  190. % meetings -> meet
  191. function s1ab = step1ab(b, k, k0)
  192. global j;
  193. if b(k) == 's'
  194. if ends('sses', b, k)
  195. k = k-2;
  196. elseif ends('ies', b, k)
  197. retVal = setto('i', b, k);
  198. b = retVal{1};
  199. k = retVal{2};
  200. elseif (b(k-1) ~= 's')
  201. k = k-1;
  202. end
  203. end
  204. if ends('eed', b, k)
  205. if measure(b, k0) > 0;
  206. k = k-1;
  207. end
  208. elseif (ends('ed', b, k) || ends('ing', b, k)) && vowelinstem(b, k0)
  209. k = j;
  210. retVal = {b, k};
  211. if ends('at', b, k)
  212. retVal = setto('ate', b(k0:k), k);
  213. elseif ends('bl', b, k)
  214. retVal = setto('ble', b(k0:k), k);
  215. elseif ends('iz', b, k)
  216. retVal = setto('ize', b(k0:k), k);
  217. elseif doublec(k, b, k0)
  218. retVal = {b, k-1};
  219. if b(retVal{2}) == 'l' || b(retVal{2}) == 's' || ...
  220. b(retVal{2}) == 'z'
  221. retVal = {retVal{1}, retVal{2}+1};
  222. end
  223. elseif measure(b, k0) == 1 && cvc(k, b, k0)
  224. retVal = setto('e', b(k0:k), k);
  225. end
  226. k = retVal{2};
  227. b = retVal{1}(k0:k);
  228. end
  229. j = k;
  230. s1ab = {b(k0:k), k};
  231. % step1c() turns terminal y to i when there is another vowel in the stem.
  232. function s1c = step1c(b, k, k0)
  233. global j;
  234. if ends('y', b, k) && vowelinstem(b, k0)
  235. b(k) = 'i';
  236. end
  237. j = k;
  238. s1c = {b, k};
  239. % step2() maps double suffices to single ones. so -ization ( = -ize plus
  240. % -ation) maps to -ize etc. note that the string before the suffix must give
  241. % m() > 0.
  242. function s2 = step2(b, k, k0)
  243. global j;
  244. s2 = {b, k};
  245. switch b(k-1)
  246. case {'a'}
  247. if ends('ational', b, k) s2 = rs('ate', b, k, k0);
  248. elseif ends('tional', b, k) s2 = rs('tion', b, k, k0); end;
  249. case {'c'}
  250. if ends('enci', b, k) s2 = rs('ence', b, k, k0);
  251. elseif ends('anci', b, k) s2 = rs('ance', b, k, k0); end;
  252. case {'e'}
  253. if ends('izer', b, k) s2 = rs('ize', b, k, k0); end;
  254. case {'l'}
  255. if ends('bli', b, k) s2 = rs('ble', b, k, k0);
  256. elseif ends('alli', b, k) s2 = rs('al', b, k, k0);
  257. elseif ends('entli', b, k) s2 = rs('ent', b, k, k0);
  258. elseif ends('eli', b, k) s2 = rs('e', b, k, k0);
  259. elseif ends('ousli', b, k) s2 = rs('ous', b, k, k0); end;
  260. case {'o'}
  261. if ends('ization', b, k) s2 = rs('ize', b, k, k0);
  262. elseif ends('ation', b, k) s2 = rs('ate', b, k, k0);
  263. elseif ends('ator', b, k) s2 = rs('ate', b, k, k0); end;
  264. case {'s'}
  265. if ends('alism', b, k) s2 = rs('al', b, k, k0);
  266. elseif ends('iveness', b, k) s2 = rs('ive', b, k, k0);
  267. elseif ends('fulness', b, k) s2 = rs('ful', b, k, k0);
  268. elseif ends('ousness', b, k) s2 = rs('ous', b, k, k0); end;
  269. case {'t'}
  270. if ends('aliti', b, k) s2 = rs('al', b, k, k0);
  271. elseif ends('iviti', b, k) s2 = rs('ive', b, k, k0);
  272. elseif ends('biliti', b, k) s2 = rs('ble', b, k, k0); end;
  273. case {'g'}
  274. if ends('logi', b, k) s2 = rs('log', b, k, k0); end;
  275. end
  276. j = s2{2};
  277. % step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
  278. function s3 = step3(b, k, k0)
  279. global j;
  280. s3 = {b, k};
  281. switch b(k)
  282. case {'e'}
  283. if ends('icate', b, k) s3 = rs('ic', b, k, k0);
  284. elseif ends('ative', b, k) s3 = rs('', b, k, k0);
  285. elseif ends('alize', b, k) s3 = rs('al', b, k, k0); end;
  286. case {'i'}
  287. if ends('iciti', b, k) s3 = rs('ic', b, k, k0); end;
  288. case {'l'}
  289. if ends('ical', b, k) s3 = rs('ic', b, k, k0);
  290. elseif ends('ful', b, k) s3 = rs('', b, k, k0); end;
  291. case {'s'}
  292. if ends('ness', b, k) s3 = rs('', b, k, k0); end;
  293. end
  294. j = s3{2};
  295. % step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
  296. function s4 = step4(b, k, k0)
  297. global j;
  298. switch b(k-1)
  299. case {'a'}
  300. if ends('al', b, k) end;
  301. case {'c'}
  302. if ends('ance', b, k)
  303. elseif ends('ence', b, k) end;
  304. case {'e'}
  305. if ends('er', b, k) end;
  306. case {'i'}
  307. if ends('ic', b, k) end;
  308. case {'l'}
  309. if ends('able', b, k)
  310. elseif ends('ible', b, k) end;
  311. case {'n'}
  312. if ends('ant', b, k)
  313. elseif ends('ement', b, k)
  314. elseif ends('ment', b, k)
  315. elseif ends('ent', b, k) end;
  316. case {'o'}
  317. if ends('ion', b, k)
  318. if j == 0
  319. elseif ~(strcmp(b(j),'s') || strcmp(b(j),'t'))
  320. j = k;
  321. end
  322. elseif ends('ou', b, k) end;
  323. case {'s'}
  324. if ends('ism', b, k) end;
  325. case {'t'}
  326. if ends('ate', b, k)
  327. elseif ends('iti', b, k) end;
  328. case {'u'}
  329. if ends('ous', b, k) end;
  330. case {'v'}
  331. if ends('ive', b, k) end;
  332. case {'z'}
  333. if ends('ize', b, k) end;
  334. end
  335. if measure(b, k0) > 1
  336. s4 = {b(k0:j), j};
  337. else
  338. s4 = {b(k0:k), k};
  339. end
  340. % step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
  341. function s5 = step5(b, k, k0)
  342. global j;
  343. j = k;
  344. if b(k) == 'e'
  345. a = measure(b, k0);
  346. if (a > 1) || ((a == 1) && ~cvc(k-1, b, k0))
  347. k = k-1;
  348. end
  349. end
  350. if (b(k) == 'l') && doublec(k, b, k0) && (measure(b, k0) > 1)
  351. k = k-1;
  352. end
  353. s5 = {b(k0:k), k};