Apache HTTPD
mod_speling.c
Go to the documentation of this file.
1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "apr.h"
18#include "apr_file_io.h"
19#include "apr_strings.h"
20#include "apr_lib.h"
21
22#define APR_WANT_STRFUNC
23#include "apr_want.h"
24
25#include "httpd.h"
26#include "http_core.h"
27#include "http_config.h"
28#include "http_request.h"
29#include "http_log.h"
30
31/* mod_speling.c - by Alexei Kosut <[email protected]> June, 1996
32 *
33 * This module is transparent, and simple. It attempts to correct
34 * misspellings of URLs that users might have entered, namely by checking
35 * capitalizations. If it finds a match, it sends a redirect.
36 *
37 * Sep-1999 Hugo Haas <[email protected]>
38 * o Added a CheckCaseOnly option to check only miscapitalized words.
39 *
40 * 08-Aug-1997 <[email protected]>
41 * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in
42 * speling_module).
43 * o Integrated tcsh's "spelling correction" routine which allows one
44 * misspelling (character insertion/omission/typo/transposition).
45 * Rewrote it to ignore case as well. This ought to catch the majority
46 * of misspelled requests.
47 * o Commented out the second pass where files' suffixes are stripped.
48 * Given the better hit rate of the first pass, this rather ugly
49 * (request index.html, receive index.db ?!?!) solution can be
50 * omitted.
51 * o wrote a "kind of" html page for mod_speling
52 *
53 * Activate it with "CheckSpelling On"
54 */
55
56module AP_MODULE_DECLARE_DATA speling_module;
57
63
64/*
65 * Create a configuration specific to this module for a server or directory
66 * location, and fill it with the default settings.
67 *
68 * The API says that in the absence of a merge function, the record for the
69 * closest ancestor is used exclusively. That's what we want, so we don't
70 * bother to have such a function.
71 */
72
73static void *mkconfig(apr_pool_t *p)
74{
75 spconfig *cfg = apr_pcalloc(p, sizeof(spconfig));
76
77 cfg->enabled = 0;
78 cfg->check_case_only = 0;
79 cfg->check_basename_match = 1;
80 return cfg;
81}
82
83/*
84 * Respond to a callback to create configuration record for a server or
85 * vhost environment.
86 */
88{
89 return mkconfig(p);
90}
91
92/*
93 * Respond to a callback to create a config record for a specific directory.
94 */
96{
97 return mkconfig(p);
98}
99
100/*
101 * Define the directives specific to this module. This structure is referenced
102 * later by the 'module' structure.
103 */
104static const command_rec speling_cmds[] =
105{
106 AP_INIT_FLAG("CheckSpelling", ap_set_flag_slot,
108 "whether or not to fix miscapitalized/misspelled requests"),
109 AP_INIT_FLAG("CheckCaseOnly", ap_set_flag_slot,
110 (void*)APR_OFFSETOF(spconfig, check_case_only), OR_OPTIONS,
111 "whether or not to fix only miscapitalized requests"),
112 AP_INIT_FLAG("CheckBasenameMatch", ap_set_flag_slot,
113 (void*)APR_OFFSETOF(spconfig, check_basename_match), OR_OPTIONS,
114 "whether or not to fix files with the same base name"),
115 { NULL }
116};
117
127
/*
 * Human-readable description of each match quality.  The table is indexed
 * with a candidate's sp_reason value cast to int, so the order of the
 * entries must stay in step with the sp_reason enumeration (declared
 * elsewhere in this file).
 *
 * Both the strings and the pointer slots are const: the table is read-only
 * lookup data and is never modified at run time.
 */
static const char *const sp_reason_str[] =
{
    "identical",
    "miscapitalized",
    "transposed characters",
    "character missing",
    "extra character",
    "mistyped character",
    "common basename",
};
138
139typedef struct {
140 const char *name;
143
144/*
145 * spdist() is taken from Kernighan & Pike,
146 * _The_UNIX_Programming_Environment_
147 * and adapted somewhat to correspond better to psychological reality.
148 * (Note the changes to the return values)
149 *
150 * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
151 * page 363, the correct order for this is:
152 * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
153 * thus, it was exactly backwards in the old version. -- PWP
154 *
155 * This routine was taken out of tcsh's spelling correction code
156 * (tcsh-6.07.04) and re-converted to apache data types ("char" type
157 * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
158 * during comparisons, so is a "approximate strcasecmp()".
159 * NOTE that is still allows only _one_ real "typo",
160 * it does NOT try to correct multiple errors.
161 */
162
163static sp_reason spdist(const char *s, const char *t)
164{
165 for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) {
166 if (*t == '\0') {
167 return SP_MISCAPITALIZED; /* exact match (sans case) */
168 }
169 }
170 if (*s) {
171 if (*t) {
172 if (s[1] && t[1] && apr_tolower(*s) == apr_tolower(t[1])
173 && apr_tolower(*t) == apr_tolower(s[1])
174 && strcasecmp(s + 2, t + 2) == 0) {
175 return SP_TRANSPOSITION; /* transposition */
176 }
177 if (strcasecmp(s + 1, t + 1) == 0) {
178 return SP_SIMPLETYPO; /* 1 char mismatch */
179 }
180 }
181 if (strcasecmp(s + 1, t) == 0) {
182 return SP_EXTRACHAR; /* extra character */
183 }
184 }
185 if (*t && strcasecmp(s, t + 1) == 0) {
186 return SP_MISSINGCHAR; /* missing character */
187 }
188 return SP_VERYDIFFERENT; /* distance too large to fix. */
189}
190
191static int sort_by_quality(const void *left, const void *rite)
192{
193 return (int) (((misspelled_file *) left)->quality)
194 - (int) (((misspelled_file *) rite)->quality);
195}
196
198{
199 spconfig *cfg;
200 char *good, *bad, *postgood, *url;
202 int filoc, dotloc, urlen, pglen;
204 apr_dir_t *dir;
205
206 cfg = ap_get_module_config(r->per_dir_config, &speling_module);
207 if (!cfg->enabled) {
208 return DECLINED;
209 }
210
211 /* We only want to worry about GETs */
212 if (r->method_number != M_GET) {
213 return DECLINED;
214 }
215
216 /* We've already got a file of some kind or another */
217 if (r->finfo.filetype != APR_NOFILE) {
218 return DECLINED;
219 }
220
221 /* Not a file request */
222 if (r->proxyreq || !r->filename) {
223 return DECLINED;
224 }
225
226 /* This is a sub request - don't mess with it */
227 if (r->main) {
228 return DECLINED;
229 }
230
231 /*
232 * The request should end up looking like this:
233 * r->uri: /correct-url/mispelling/more
234 * r->filename: /correct-file/mispelling r->path_info: /more
235 *
236 * So we do this in steps. First break r->filename into two pieces
237 */
238
239 filoc = ap_rind(r->filename, '/');
240 /*
241 * Don't do anything if the request doesn't contain a slash, or
242 * requests "/"
243 */
244 if (filoc == -1 || strcmp(r->uri, "/") == 0) {
245 return DECLINED;
246 }
247
248 /* good = /correct-file */
249 good = apr_pstrndup(r->pool, r->filename, filoc);
250 /* bad = mispelling */
251 bad = apr_pstrdup(r->pool, r->filename + filoc + 1);
252 /* postgood = mispelling/more */
254
255 urlen = strlen(r->uri);
256 pglen = strlen(postgood);
257
258 /* Check to see if the URL pieces add up */
259 if (strcmp(postgood, r->uri + (urlen - pglen))) {
260 return DECLINED;
261 }
262
263 /* url = /correct-url */
264 url = apr_pstrndup(r->pool, r->uri, (urlen - pglen));
265
266 /* Now open the directory and do ourselves a check... */
267 if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) {
268 /* Oops, not a directory... */
269 return DECLINED;
270 }
271
273
274 dotloc = ap_ind(bad, '.');
275 if (dotloc == -1) {
276 dotloc = strlen(bad);
277 }
278
280 sp_reason q;
281
282 /*
283 * If we end up with a "fixed" URL which is identical to the
284 * requested one, we must have found a broken symlink or some such.
285 * Do _not_ try to redirect this, it causes a loop!
286 */
287 if (strcmp(bad, dirent.name) == 0) {
289 return OK;
290 }
291
292 /*
293 * miscapitalization errors are checked first (like, e.g., lower case
294 * file, upper case request)
295 */
296 else if (strcasecmp(bad, dirent.name) == 0) {
298
300 sp_new->name = apr_pstrdup(r->pool, dirent.name);
301 sp_new->quality = SP_MISCAPITALIZED;
302 }
303
304 /*
305 * simple typing errors are checked next (like, e.g.,
306 * missing/extra/transposed char)
307 */
308 else if ((cfg->check_case_only == 0)
309 && ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT)) {
311
313 sp_new->name = apr_pstrdup(r->pool, dirent.name);
314 sp_new->quality = q;
315 }
316
317 /*
318 * The spdist() should have found the majority of the misspelled
319 * requests. It is of questionable use to continue looking for
320 * files with the same base name, but potentially of totally wrong
321 * type (index.html <-> index.db).
322 *
323 * If you're using MultiViews, and have a file named foobar.html,
324 * which you refer to as "foobar", and someone tried to access
325 * "Foobar", without CheckBasenameMatch, mod_speling won't find it,
326 * because it won't find anything matching that spelling.
327 * With the extension-munging, it would locate "foobar.html".
328 */
329 else if (cfg->check_basename_match == 1) {
330 /*
331 * Okay... we didn't find anything. Now we take out the hard-core
332 * power tools. There are several cases here. Someone might have
333 * entered a wrong extension (.htm instead of .html or vice
334 * versa) or the document could be negotiated. At any rate, now
335 * we just compare stuff before the first dot. If it matches, we
336 * figure we got us a match. This can result in wrong things if
337 * there are files of different content types but the same prefix
338 * (e.g. foo.gif and foo.html) This code will pick the first one
339 * it finds. Better than a Not Found, though.
340 */
341 int entloc = ap_ind(dirent.name, '.');
342 if (entloc == -1) {
343 entloc = strlen(dirent.name);
344 }
345
346 if ((dotloc == entloc)
347 && !strncasecmp(bad, dirent.name, dotloc)) {
349
351 sp_new->name = apr_pstrdup(r->pool, dirent.name);
352 sp_new->quality = SP_VERYDIFFERENT;
353 }
354 }
355 }
357
358 if (candidates->nelts != 0) {
359 /* Wow... we found us a mispelling. Construct a fixed url */
360 char *nuri;
361 const char *ref;
363 int i;
364
365 ref = apr_table_get(r->headers_in, "Referer");
366
367 qsort((void *) candidates->elts, candidates->nelts,
369
370 /*
371 * Conditions for immediate redirection:
372 * a) the first candidate was not found by stripping the suffix
373 * AND b) there exists only one candidate OR the best match is not
374 * ambiguous
375 * then return a redirection right away.
376 */
377 if (variant[0].quality != SP_VERYDIFFERENT
378 && (candidates->nelts == 1
379 || variant[0].quality != variant[1].quality)) {
380
382 variant[0].name,
383 r->path_info, NULL));
384 if (r->parsed_uri.query)
386
387 apr_table_setn(r->headers_out, "Location",
389
391 r,
392 ref ? APLOGNO(03224) "Fixed spelling: %s to %s from %s"
393 : APLOGNO(03225) "Fixed spelling: %s to %s%s",
394 r->uri, nuri,
395 (ref ? ref : ""));
396
398 }
399 /*
400 * Otherwise, a "[300] Multiple Choices" list with the variants is
401 * returned.
402 */
403 else {
404 apr_pool_t *p;
405 apr_table_t *notes;
409
410
411 if (r->main == NULL) {
412 p = r->pool;
413 notes = r->notes;
414 }
415 else {
416 p = r->main->pool;
417 notes = r->main->notes;
418 }
419
421 return DECLINED;
422 apr_pool_tag(sub_pool, "speling_sub");
423
424 t = apr_array_make(sub_pool, candidates->nelts * 8 + 8,
425 sizeof(char *));
426 v = apr_array_make(sub_pool, candidates->nelts * 5,
427 sizeof(char *));
428
429 /* Generate the response text. */
430
431 *(const char **)apr_array_push(t) =
432 "The document name you requested (<code>";
433 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri);
434 *(const char **)apr_array_push(t) =
435 "</code>) could not be found on this server.\n"
436 "However, we found documents with names similar "
437 "to the one you requested.<p>"
438 "Available documents:\n<ul>\n";
439
440 for (i = 0; i < candidates->nelts; ++i) {
441 char *vuri;
442 const char *reason;
443
444 reason = sp_reason_str[(int) (variant[i].quality)];
445 /* The format isn't very neat... */
447 (r->parsed_uri.query != NULL) ? "?" : "",
448 (r->parsed_uri.query != NULL)
449 ? r->parsed_uri.query : "",
450 NULL);
451 *(const char **)apr_array_push(v) = "\"";
452 *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri);
453 *(const char **)apr_array_push(v) = "\";\"";
454 *(const char **)apr_array_push(v) = reason;
455 *(const char **)apr_array_push(v) = "\"";
456
457 *(const char **)apr_array_push(t) = "<li><a href=\"";
458 *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri);
459 *(const char **)apr_array_push(t) = "\">";
460 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri);
461 *(const char **)apr_array_push(t) = "</a> (";
462 *(const char **)apr_array_push(t) = reason;
463 *(const char **)apr_array_push(t) = ")\n";
464
465 /*
466 * when we have printed the "close matches" and there are
467 * more "distant matches" (matched by stripping the suffix),
468 * then we insert an additional separator text to suggest
469 * that the user LOOK CLOSELY whether these are really the
470 * files she wanted.
471 */
472 if (i > 0 && i < candidates->nelts - 1
473 && variant[i].quality != SP_VERYDIFFERENT
474 && variant[i + 1].quality == SP_VERYDIFFERENT) {
475 *(const char **)apr_array_push(t) =
476 "</ul>\nFurthermore, the following related "
477 "documents were found:\n<ul>\n";
478 }
479 }
480 *(const char **)apr_array_push(t) = "</ul>\n";
481
482 /* If we know there was a referring page, add a note: */
483 if (ref != NULL) {
484 *(const char **)apr_array_push(t) =
485 "Please consider informing the owner of the "
486 "referring page <tt>";
487 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, ref);
488 *(const char **)apr_array_push(t) =
489 "</tt> about the broken link.\n";
490 }
491
492
493 /* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */
494 apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0));
495
496 apr_table_mergen(r->subprocess_env, "VARIANTS",
497 apr_array_pstrcat(p, v, ','));
498
500
502 ref ? APLOGNO(03226) "Spelling fix: %s: %d candidates from %s"
503 : APLOGNO(03227) "Spelling fix: %s: %d candidates%s",
504 r->uri, candidates->nelts,
505 (ref ? ref : ""));
506
508 }
509 }
510
511 return OK;
512}
513
518
520{
522 create_mconfig_for_directory, /* create per-dir config */
523 NULL, /* merge per-dir config */
524 create_mconfig_for_server, /* server config */
525 NULL, /* merge server config */
526 speling_cmds, /* command apr_table_t */
527 register_hooks /* register hooks */
528};
APR File I/O Handling.
APR general purpose library routines.
APR Strings library.
APR Standard Headers Support.
#define ap_get_module_config(v, m)
#define AP_DECLARE_MODULE(foo)
#define AP_INIT_FLAG(directive, func, mconfig, where, help)
const char * ap_set_flag_slot(cmd_parms *cmd, void *struct_ptr, int arg)
Definition config.c:1512
request_rec * r
#define DECLINED
Definition httpd.h:457
#define OK
Definition httpd.h:456
char * ap_construct_url(apr_pool_t *p, const char *uri, request_rec *r)
Definition core.c:1246
#define APLOGNO(n)
Definition http_log.h:117
#define APLOG_INFO
Definition http_log.h:70
#define ap_log_rerror
Definition http_log.h:454
#define APLOG_MARK
Definition http_log.h:283
void ap_hook_fixups(ap_HOOK_fixups_t *pf, const char *const *aszPre, const char *const *aszSucc, int nOrder)
Definition request.c:87
int enabled
const char * url
Definition apr_escape.h:120
#define APR_HOOK_LAST
Definition apr_hooks.h:305
#define OR_OPTIONS
#define HTTP_MULTIPLE_CHOICES
Definition httpd.h:500
#define HTTP_MOVED_PERMANENTLY
Definition httpd.h:501
#define M_GET
Definition httpd.h:592
#define STANDARD20_MODULE_STUFF
#define ap_escape_uri(ppool, path)
Definition httpd.h:1836
int ap_rind(const char *str, char c)
Definition util.c:2401
#define ap_escape_html(p, s)
Definition httpd.h:1860
int ap_ind(const char *str, char c)
Definition util.c:2392
apr_size_t size
#define apr_tolower(c)
Definition apr_lib.h:231
#define APR_SUCCESS
Definition apr_errno.h:225
@ APR_NOFILE
#define APR_FINFO_DIRENT
int strcasecmp(const char *a, const char *b)
int strncasecmp(const char *a, const char *b, size_t n)
apr_interval_time_t t
#define apr_pool_create(newpool, parent)
Definition apr_pools.h:322
#define apr_pcalloc(p, size)
Definition apr_pools.h:465
apr_dir_t * dir
const char * s
Definition apr_strings.h:95
int nelts
Definition apr_tables.h:122
int reason
Apache Configuration.
CORE HTTP Daemon.
Apache Logging library.
Apache Request library.
HTTP Daemon routines.
apr_pool_t * p
Definition md_event.c:32
return NULL
Definition mod_so.c:359
int i
Definition mod_so.c:347
sp_reason
@ SP_TRANSPOSITION
@ SP_MISSINGCHAR
@ SP_MISCAPITALIZED
@ SP_EXTRACHAR
@ SP_VERYDIFFERENT
@ SP_SIMPLETYPO
@ SP_IDENTICAL
static const command_rec speling_cmds[]
static void * mkconfig(apr_pool_t *p)
Definition mod_speling.c:73
static int check_speling(request_rec *r)
static sp_reason spdist(const char *s, const char *t)
static void register_hooks(apr_pool_t *p)
static int sort_by_quality(const void *left, const void *rite)
static void * create_mconfig_for_directory(apr_pool_t *p, char *dir)
Definition mod_speling.c:95
static const char * sp_reason_str[]
static void * create_mconfig_for_server(apr_pool_t *p, server_rec *s)
Definition mod_speling.c:87
#define bad(x)
Definition sdbm.c:50
char * name
apr_filetype_e filetype
char * query
Definition apr_uri.h:101
sp_reason quality
const char * name
A structure that represents the current request.
Definition httpd.h:845
char * uri
Definition httpd.h:1016
apr_table_t * notes
Definition httpd.h:985
int method_number
Definition httpd.h:898
apr_pool_t * pool
Definition httpd.h:847
char * filename
Definition httpd.h:1018
apr_uri_t parsed_uri
Definition httpd.h:1092
int proxyreq
Definition httpd.h:873
apr_finfo_t finfo
Definition httpd.h:1094
apr_table_t * headers_in
Definition httpd.h:976
request_rec * main
Definition httpd.h:860
apr_table_t * subprocess_env
Definition httpd.h:983
struct ap_conf_vector_t * per_dir_config
Definition httpd.h:1047
char * path_info
Definition httpd.h:1024
apr_table_t * headers_out
Definition httpd.h:978
A structure to store information for each virtual server.
Definition httpd.h:1322
int check_case_only
Definition mod_speling.c:60
int check_basename_match
Definition mod_speling.c:61
int enabled
Definition mod_speling.c:59
apr_status_t apr_dir_read(apr_finfo_t *finfo, apr_int32_t wanted, apr_dir_t *thedir)
Definition dir.c:142
apr_status_t apr_dir_close(apr_dir_t *thedir)
Definition dir.c:109
apr_status_t apr_dir_open(apr_dir_t **new, const char *dirname, apr_pool_t *pool)
Definition dir.c:75
typedef int(WSAAPI *apr_winapi_fpt_WSAPoll)(IN OUT LPWSAPOLLFD fdArray