NetSurf
urldb.c
Go to the documentation of this file.
1/*
2 * Copyright 2006 John M Bell <jmb202@ecs.soton.ac.uk>
3 * Copyright 2009 John Tytgat <joty@netsurf-browser.org>
4 *
5 * This file is part of NetSurf, http://www.netsurf-browser.org/
6 *
7 * NetSurf is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 * NetSurf is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20/**
21 * \file
22 * Unified URL information database implementation
23 *
24 * URLs are stored in a tree-based structure as follows:
25 *
26 * The host component is extracted from each URL and, if a FQDN, split on
27 * every '.'. The tree is constructed by inserting each FQDN segment in
28 * reverse order. Duplicate nodes are merged.
29 *
30 * If the host part of an URL is an IP address, then this is added to the
31 * tree verbatim (as if it were a TLD).
32 *
33 * This provides something looking like:
34 *
35 *                           root (a sentinel)
36 *                             |
37 *     -------------------------------------------------
38 *     |       |       |       |       |       |       |
39 *    com     edu     gov  127.0.0.1  net     org     uk      TLDs
40 *     |       |       |               |       |       |
41 *   google   ...     ...             ...     ...     co      2LDs
42 *     |                                               |
43 *    www                                             bbc     Hosts/Subdomains
44 *                                                     |
45 *                                                    www     ...
46 *
47 * Each of the nodes in this tree is a struct host_part. This stores the
48 * FQDN segment (or IP address) with which the node is concerned. Each node
49 * may contain further information about paths on a host (struct path_data)
50 * or SSL certificate processing on a host-wide basis
51 * (host_part::permit_invalid_certs).
52 *
53 * Path data is concerned with storing various metadata about the path in
54 * question. This includes global history data, HTTP authentication details
55 * and any associated HTTP cookies. This is stored as a tree of path segments
56 * hanging off the relevant host_part node.
57 *
58 * Therefore, to find the last visited time of the URL
59 * http://www.example.com/path/to/resource.html, the FQDN tree would be
60 * traversed in the order root -> "com" -> "example" -> "www". The "www"
61 * node would have attached to it a tree of struct path_data:
62 *
63 *                          (sentinel)
64 *                              |
65 *                             path
66 *                              |
67 *                              to
68 *                              |
69 *                        resource.html
70 *
71 * This represents the absolute path "/path/to/resource.html". The leaf node
72 * "resource.html" contains the last visited time of the resource.
73 *
74 * The mechanism described above is, however, not particularly conducive to
75 * fast searching of the database for a given URL (or URLs beginning with a
76 * given prefix). Therefore, an ancillary data structure is used to enable
77 * fast searching. This structure simply reflects the contents of the
78 * database, with entries being added/removed at the same time as for the
79 * core database. In order to ensure that degenerate cases are kept to a
80 * minimum, we use an AAtree. This is an approximation of a Red-Black tree
81 * with similar performance characteristics, but with a significantly
82 * simpler implementation. Entries in this tree comprise pointers to the
83 * leaf nodes of the host tree described above.
84 *
85 * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of
86 * non-normalised URLs with urldb will result in undefined behaviour and
87 * potential crashes.
88 */
89
90#include <assert.h>
91#include <stdbool.h>
92#include <stdio.h>
93#include <stdlib.h>
94#include <string.h>
95#include <strings.h>
96#include <time.h>
97#ifdef WITH_NSPSL
98#include <nspsl.h>
99#endif
100
101#include "utils/inet.h"
102#include "utils/nsoption.h"
103#include "utils/log.h"
104#include "utils/corestrings.h"
105#include "utils/url.h"
106#include "utils/utils.h"
107#include "utils/bloom.h"
108#include "utils/time.h"
109#include "utils/nsurl.h"
110#include "utils/ascii.h"
111#include "utils/http.h"
112#include "netsurf/bitmap.h"
114
115#include "content/content.h"
116#include "content/urldb.h"
117
118/**
119 * cookie entry.
120 *
121 * \warning This *must* be kept in sync with the public interface in
122 * netsurf/cookie_db.h
123 */
125 struct cookie_internal_data *prev; /**< Previous in list */
126 struct cookie_internal_data *next; /**< Next in list */
127
128 char *name; /**< Cookie name */
129 char *value; /**< Cookie value */
130 bool value_was_quoted; /**< Value was quoted in Set-Cookie: */
131 char *comment; /**< Cookie comment */
132 bool domain_from_set; /**< Domain came from Set-Cookie: header */
133 char *domain; /**< Domain */
134 bool path_from_set; /**< Path came from Set-Cookie: header */
135 char *path; /**< Path */
136 time_t expires; /**< Expiry timestamp, or -1 for session */
137 time_t last_used; /**< Last used time */
138 bool secure; /**< Only send for HTTPS requests */
139 bool http_only; /**< Only expose to HTTP(S) requests */
140 enum cookie_version version; /**< Specification compliance */
141 bool no_destroy; /**< Never destroy this cookie,
142 * unless it's expired */
143
144};
145
146
147/**
148 * A protection space
149 *
150 * This is defined as a tuple canonical_root_url and realm. This
151 * structure lives as linked list element in a leaf host_part struct
152 * so we need additional scheme and port to have a canonical_root_url.
153 */
155 /**
156 * URL scheme of canonical hostname of this protection space.
157 */
158 lwc_string *scheme;
159 /**
160 * Port number of canonical hostname of this protection
161 * space. When 0, it means the default port for given scheme,
162 * i.e. 80 (http), 443 (https).
163 */
164 unsigned int port;
165 /** Protection realm */
166 char *realm;
167
168 /**
169 * Authentication details for this protection space in form
170 * username:password
171 */
172 char *auth;
173 /** Next sibling */
175};
176
177
178/**
179 * meta data about a url
180 *
181 * \warning must be kept in sync with url_data structure in netsurf/url_db.h
182 */
184 char *title; /**< Resource title */
185 unsigned int visits; /**< Visit count */
186 time_t last_visit; /**< Last visit time */
187 content_type type; /**< Type of resource */
188};
189
190
191/**
192 * data entry for url
193 */
194struct path_data {
195 nsurl *url; /**< Full URL */
196 lwc_string *scheme; /**< URL scheme for data */
197 unsigned int port; /**< Port number for data. When 0, it means
198 * the default port for given scheme, i.e.
199 * 80 (http), 443 (https). */
200 char *segment; /**< Path segment for this node */
201 unsigned int frag_cnt; /**< Number of entries in path_data::fragment */
202 char **fragment; /**< Array of fragments */
203 bool persistent; /**< This entry should persist */
204
205 struct url_internal_data urld; /**< URL data for resource */
206
207 /**
208 * Protection space to which this resource belongs. Can be
209 * NULL when it does not belong to a protection space or when
210 * it is not known. No ownership (is with struct host_part::prot_space).
211 */
213 /** Cookies associated with resource */
215 /** Last cookie in list */
217
218 struct path_data *next; /**< Next sibling */
219 struct path_data *prev; /**< Previous sibling */
220 struct path_data *parent; /**< Parent path segment */
221 struct path_data *children; /**< Child path segments */
222 struct path_data *last; /**< Last child */
223};
224
/**
 * HSTS data recorded for a host.
 */
struct hsts_data {
	/** Expiry time */
	time_t expires;
	/** Whether to include subdomains */
	bool include_sub_domains;
};
229
230struct host_part {
231 /**
232 * Known paths on this host. This _must_ be first so that
233 * struct host_part *h = (struct host_part *)mypath; works
234 */
236 /**
237 * Allow access to SSL protected resources on this host
238 * without verifying certificate authenticity
239 */
241 /* HSTS data */
243
244 /**
245 * Part of host string
246 */
247 char *part;
248
249 /**
250 * Linked list of all known protection spaces known for this
251 * host and all its schemes and ports.
252 */
254
255 struct host_part *next; /**< Next sibling */
256 struct host_part *prev; /**< Previous sibling */
257 struct host_part *parent; /**< Parent host part */
258 struct host_part *children; /**< Child host parts */
259};
260
261
262/**
263 * search index node
264 */
266 const struct host_part *data; /**< Host tree entry */
267
268 unsigned int level; /**< Node level */
269
270 struct search_node *left; /**< Left subtree */
271 struct search_node *right; /**< Right subtree */
272};
273
274/** Root database handle */
275static struct host_part db_root;
276
277/** Search trees - one per letter + 1 for IPs + 1 for Everything Else */
278#define NUM_SEARCH_TREES 28
279#define ST_IP 0
280#define ST_EE 1
281#define ST_DN 2
282static struct search_node empty = { 0, 0, &empty, &empty };
284 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
285 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
286 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
287 &empty, &empty, &empty, &empty
288};
289
290/** Minimum cookie database file version */
291#define MIN_COOKIE_FILE_VERSION 100
292/** Current cookie database file version */
293#define COOKIE_FILE_VERSION 102
294/** loaded cookie file version */
296
297/** Minimum URL database file version */
298#define MIN_URL_FILE_VERSION 106
299/** Current URL database file version */
300#define URL_FILE_VERSION 107
301
302/**
303 * filter for url presence in database
304 *
305 * Bloom filter used for short-circuiting the false case of "is this
306 * URL in the database?". BLOOM_SIZE controls how large the filter is
307 * in bytes. Primitive experimentation shows that for a filter of X
308 * bytes filled with X items, searching for X items not in the filter
309 * has a 5% false-positive rate. We set it to 32kB, which should be
310 * enough for all but the largest databases, while not being
311 * shockingly wasteful on memory.
312 */
313static struct bloom_filter *url_bloom;
314/**
315 * Size of url filter
316 */
317#define BLOOM_SIZE (1024 * 32)
318
319
320/**
321 * write a time_t to a file portably
322 *
323 * \param fp File to write to
324 * \param val the unix time value to output
325 * \return NSERROR_OK on success
326 */
327static nserror urldb_write_timet(FILE *fp, time_t val)
328{
329 int use;
330 char op[32];
331
332 use = nsc_sntimet(op, 32, &val);
333 if (use == 0) {
334 fprintf(fp, "%i\n", (int)val);
335 } else {
336 fprintf(fp, "%.*s\n", use, op);
337 }
338 return NSERROR_OK;
339}
340
341/**
342 * Write paths associated with a host
343 *
344 * \param parent Root of (sub)tree to write
345 * \param host Current host name
346 * \param fp File to write to
347 * \param path Current path string
348 * \param path_alloc Allocated size of path
349 * \param path_used Used size of path
350 * \param expiry Expiry time of URLs
351 */
352static void
354 const char *host,
355 FILE *fp,
356 char **path,
357 int *path_alloc,
358 int *path_used,
359 time_t expiry)
360{
361 const struct path_data *p = parent;
362 int i;
363
364 do {
365 int seglen = p->segment != NULL ? strlen(p->segment) : 0;
366 int len = *path_used + seglen + 1;
367
368 if (*path_alloc < len) {
369 char *temp;
370 temp = realloc(*path,
371 (len > 64) ? len : *path_alloc + 64);
372 if (!temp) {
373 return;
374 }
375 *path = temp;
376 *path_alloc = (len > 64) ? len : *path_alloc + 64;
377 }
378
379 if (p->segment != NULL) {
380 memcpy(*path + *path_used - 1, p->segment, seglen);
381 }
382
383 if (p->children != NULL) {
384 (*path)[*path_used + seglen - 1] = '/';
385 (*path)[*path_used + seglen] = '\0';
386 } else {
387 (*path)[*path_used + seglen - 1] = '\0';
388 len -= 1;
389 }
390
391 *path_used = len;
392
393 if (p->children != NULL) {
394 /* Drill down into children */
395 p = p->children;
396 } else {
397 /* leaf node */
398 if (p->persistent ||
399 ((p->urld.last_visit > expiry) &&
400 (p->urld.visits > 0))) {
401 fprintf(fp, "%s\n", lwc_string_data(p->scheme));
402
403 if (p->port) {
404 fprintf(fp,"%d\n", p->port);
405 } else {
406 fprintf(fp, "\n");
407 }
408
409 fprintf(fp, "%s\n", *path);
410
411 /** \todo handle fragments? */
412
413 /* number of visits */
414 fprintf(fp, "%i\n", p->urld.visits);
415
416 /* time entry was last used */
418
419 /* entry type */
420 fprintf(fp, "%i\n", (int)p->urld.type);
421
422 fprintf(fp, "\n");
423
424 if (p->urld.title) {
425 uint8_t *s = (uint8_t *) p->urld.title;
426
427 for (i = 0; s[i] != '\0'; i++)
428 if (s[i] < 32)
429 s[i] = ' ';
430 for (--i; ((i > 0) && (s[i] == ' '));
431 i--)
432 s[i] = '\0';
433 fprintf(fp, "%s\n", p->urld.title);
434 } else {
435 fprintf(fp, "\n");
436 }
437 }
438
439 /* Now, find next node to process. */
440 while (p != parent) {
441 int seglen = p->segment != NULL
442 ? strlen(p->segment) : 0;
443
444 /* Remove our segment from the path */
445 *path_used -= seglen;
446 (*path)[*path_used - 1] = '\0';
447
448 if (p->next != NULL) {
449 /* Have a sibling, process that */
450 p = p->next;
451 break;
452 }
453
454 /* Going up, so remove '/' */
455 *path_used -= 1;
456 (*path)[*path_used - 1] = '\0';
457
458 /* Ascend tree */
459 p = p->parent;
460 }
461 }
462 } while (p != parent);
463}
464
465
466/**
467 * Count number of URLs associated with a host
468 *
469 * \param root Root of path data tree
470 * \param expiry Expiry time for URLs
471 * \param count Pointer to count
472 */
473static void
475 time_t expiry,
476 unsigned int *count)
477{
478 const struct path_data *p = root;
479
480 do {
481 if (p->children != NULL) {
482 /* Drill down into children */
483 p = p->children;
484 } else {
485 /* No more children, increment count if required */
486 if (p->persistent ||
487 ((p->urld.last_visit > expiry) &&
488 (p->urld.visits > 0))) {
489 (*count)++;
490 }
491
492 /* Now, find next node to process. */
493 while (p != root) {
494 if (p->next != NULL) {
495 /* Have a sibling, process that */
496 p = p->next;
497 break;
498 }
499
500 /* Ascend tree */
501 p = p->parent;
502 }
503 }
504 } while (p != root);
505}
506
507
508/**
509 * Save a search (sub)tree
510 *
511 * \param parent root node of search tree to save.
512 * \param fp File to write to
513 */
514static void urldb_save_search_tree(struct search_node *parent, FILE *fp)
515{
516 char host[256];
517 const struct host_part *h;
518 unsigned int path_count = 0;
519 char *path, *p, *end;
520 int path_alloc = 64, path_used = 1;
521 time_t expiry, hsts_expiry = 0;
522 int hsts_include_subdomains = 0;
523
524 expiry = time(NULL) - ((60 * 60 * 24) * nsoption_int(expire_url));
525
526 if (parent == &empty)
527 return;
528
530
531 path = malloc(path_alloc);
532 if (!path)
533 return;
534
535 path[0] = '\0';
536
537 for (h = parent->data, p = host, end = host + sizeof host;
538 h && h != &db_root && p < end; h = h->parent) {
539 int written = snprintf(p, end - p, "%s%s", h->part,
540 (h->parent && h->parent->parent) ? "." : "");
541 if (written < 0) {
542 free(path);
543 return;
544 }
545 p += written;
546 }
547
548 h = parent->data;
549 if (h && h->hsts.expires > expiry) {
550 hsts_expiry = h->hsts.expires;
551 hsts_include_subdomains = h->hsts.include_sub_domains;
552 }
553
554 urldb_count_urls(&parent->data->paths, expiry, &path_count);
555
556 if (path_count > 0) {
557 fprintf(fp, "%s %i ", host, hsts_include_subdomains);
558 urldb_write_timet(fp, hsts_expiry);
559 fprintf(fp, "%i\n", path_count);
560
561 urldb_write_paths(&parent->data->paths, host, fp,
562 &path, &path_alloc, &path_used, expiry);
563 } else if (hsts_expiry) {
564 fprintf(fp, "%s %i ", host, hsts_include_subdomains);
565 urldb_write_timet(fp, hsts_expiry);
566 fprintf(fp, "0\n");
567 }
568
569 free(path);
570
571 urldb_save_search_tree(parent->right, fp);
572}
573
574
575/**
576 * Path data iterator (internal)
577 *
578 * \param parent Root of subtree to iterate over
579 * \param url_callback Callback function
580 * \param cookie_callback Callback function
581 * \return true to continue, false otherwise
582 */
583static bool
585 bool (*url_callback)(nsurl *url, const struct url_data *data),
586 bool (*cookie_callback)(const struct cookie_data *data))
587{
588 const struct path_data *p = parent;
589 const struct cookie_data *c;
590
591 do {
592 if (p->children != NULL) {
593 /* Drill down into children */
594 p = p->children;
595 } else {
596 /* All leaf nodes in the path tree should have an URL or
597 * cookies attached to them. If this is not the case, it
598 * indicates that there's a bug in the file loader/URL
599 * insertion code. Therefore, assert this here. */
600 assert(url_callback || cookie_callback);
601
602 /** \todo handle fragments? */
603 if (url_callback) {
604 const struct url_internal_data *u = &p->urld;
605
606 assert(p->url);
607
608 if (!url_callback(p->url,
609 (const struct url_data *) u))
610 return false;
611 } else {
612 c = (const struct cookie_data *)p->cookies;
613 for (; c != NULL; c = c->next) {
614 if (!cookie_callback(c))
615 return false;
616 }
617 }
618
619 /* Now, find next node to process. */
620 while (p != parent) {
621 if (p->next != NULL) {
622 /* Have a sibling, process that */
623 p = p->next;
624 break;
625 }
626
627 /* Ascend tree */
628 p = p->parent;
629 }
630 }
631 } while (p != parent);
632
633 return true;
634}
635
636
637/**
638 * Check whether a host string is an IP address.
639 *
640 * This call detects IPv4 addresses (all of dotted-quad or subsets,
641 * decimal or hexadecimal notations) and IPv6 addresses (including
642 * those containing embedded IPv4 addresses.)
643 *
644 * \param host a hostname terminated by '\0'
645 * \return true if the hostname is an IP address, false otherwise
646 */
647static bool urldb__host_is_ip_address(const char *host)
648{
649 struct in_addr ipv4;
650 size_t host_len = strlen(host);
651 const char *sane_host;
652 const char *slash;
653#ifndef NO_IPV6
654 struct in6_addr ipv6;
655 char ipv6_addr[64];
656 unsigned int ipv6_addr_len;
657#endif
658 /**
659 * @todo FIXME Some parts of urldb.c make confusions between hosts
660 * and "prefixes", we can sometimes be erroneously passed more than
661 * just a host. Sometimes we may be passed trailing slashes, or even
662 * whole path segments. A specific criminal in this class is
663 * urldb_iterate_partial, which takes a prefix to search for, but
664 * passes that prefix to functions that expect only hosts.
665 *
666 * For the time being, we will accept such calls; we check if there
667 * is a / in the host parameter, and if there is, we take a copy and
668 * replace the / with a \0. This is not a permanent solution; we
669 * should search through NetSurf and find all the callers that are
670 * in error and fix them. When doing this task, it might be wise
671 * to replace the hideousness below with code that doesn't have to do
672 * this, and add assert(strchr(host, '/') == NULL); somewhere.
673 * -- rjek - 2010-11-04
674 */
675
676 slash = strchr(host, '/');
677 if (slash == NULL) {
678 sane_host = host;
679 } else {
680 char *c = strdup(host);
681 c[slash - host] = '\0';
682 sane_host = c;
683 host_len = slash - host;
684 NSLOG(netsurf, INFO, "WARNING: called with non-host '%s'",
685 host);
686 }
687
688 if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
689 goto out_false;
690
691 if (inet_aton(sane_host, &ipv4) != 0) {
692 /* This can only be a sane IPv4 address if it contains 3 dots.
693 * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
694 * and "a.b.c.d" as valid IPv4 address strings where we only
695 * support the full, dotted-quad, form.
696 */
697 int num_dots = 0;
698 size_t index;
699
700 for (index = 0; index < host_len; index++) {
701 if (sane_host[index] == '.')
702 num_dots++;
703 }
704
705 if (num_dots == 3)
706 goto out_true;
707 else
708 goto out_false;
709 }
710
711#ifndef NO_IPV6
712 if ((host_len < 6) ||
713 (sane_host[0] != '[') ||
714 (sane_host[host_len - 1] != ']')) {
715 goto out_false;
716 }
717
718 ipv6_addr_len = host_len - 2;
719 if (ipv6_addr_len >= sizeof(ipv6_addr)) {
720 ipv6_addr_len = sizeof(ipv6_addr) - 1;
721 }
722 strncpy(ipv6_addr, sane_host + 1, ipv6_addr_len);
723 ipv6_addr[ipv6_addr_len] = '\0';
724
725 if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
726 goto out_true;
727#endif
728
729out_false:
730 if (slash != NULL) free((void *)sane_host);
731 return false;
732
733out_true:
734 if (slash != NULL) free((void *)sane_host);
735 return true;
736}
737
738
739/**
740 * Compare host_part with prefix
741 *
742 * \param a host part
743 * \param b prefix
744 * \return 0 if match, non-zero, otherwise
745 */
746static int urldb_search_match_prefix(const struct host_part *a, const char *b)
747{
748 const char *end, *dot;
749 int plen, ret;
750
751 assert(a && a != &db_root && b);
752
754 /* IP address */
755 return strncasecmp(a->part, b, strlen(b));
756 }
757
758 end = b + strlen(b) + 1;
759
760 while (b < end && a && a != &db_root) {
761 dot = strchr(b, '.');
762 if (!dot) {
763 /* last segment */
764 dot = end - 1;
765 }
766
767 /* Compare strings (length limited) */
768 if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
769 /* didn't match => return difference */
770 return ret;
771
772 /* The strings matched */
773 if (dot < end - 1) {
774 /* Consider segment lengths only in the case
775 * where the prefix contains segments */
776 plen = strlen(a->part);
777 if (plen > dot - b) {
778 /* len(a) > len(b) */
779 return 1;
780 } else if (plen < dot - b) {
781 /* len(a) < len(b) */
782 return -1;
783 }
784 }
785
786 b = dot + 1;
787 a = a->parent;
788 }
789
790 /* If we get here then either:
791 * a) The path lengths differ
792 * or b) The hosts are identical
793 */
794 if (a && a != &db_root && b >= end) {
795 /* len(a) > len(b) => prefix matches */
796 return 0;
797 } else if ((!a || a == &db_root) && b < end) {
798 /* len(a) < len(b) => prefix does not match */
799 return -1;
800 }
801
802 /* Identical */
803 return 0;
804}
805
806
807/**
808 * Partial host iterator (internal)
809 *
810 * \param root Root of (sub)tree to traverse
811 * \param prefix Prefix to match
812 * \param callback Callback function
813 * \return true to continue, false otherwise
814 */
815static bool
817 const char *prefix,
818 bool (*callback)(nsurl *url, const struct url_data *data))
819{
820 int c;
821
822 assert(root && prefix && callback);
823
824 if (root == &empty)
825 return true;
826
827 c = urldb_search_match_prefix(root->data, prefix);
828
829 if (c > 0) {
830 /* No match => look in left subtree */
831 return urldb_iterate_partial_host(root->left,
832 prefix,
833 callback);
834 } else if (c < 0) {
835 /* No match => look in right subtree */
836 return urldb_iterate_partial_host(root->right,
837 prefix,
838 callback);
839 } else {
840 /* Match => iterate over l/r subtrees & process this node */
842 prefix,
843 callback)) {
844 return false;
845 }
846
847 if (root->data->paths.children) {
848 /* and extract all paths attached to this host */
849 if (!urldb_iterate_entries_path(&root->data->paths,
850 callback,
851 NULL)) {
852 return false;
853 }
854 }
855
857 prefix,
858 callback)) {
859 return false;
860 }
861 }
862
863 return true;
864}
865
866
867/**
868 * Partial path iterator (internal)
869 *
870 * Given: http://www.example.org/a/b/c/d//e
871 * and assuming a path tree:
872 *     ^
873 *    / \
874 *   a1 b1
875 *  / \
876 * a2 b2
877 *    /|\
878 *   a b c
879 *   3 3 |
880 *       d
881 *       |
882 *       e
883 *      / \
884 *      f g
885 *
886 * Prefix will be:     p will be:
887 *
888 * a/b/c/d//e          a1
889 * b/c/d//e            a2
890 * b/c/d//e            b3
891 * c/d//e              a3
892 * c/d//e              b3
893 * c/d//e              c
894 * d//e                d
895 * /e                  e               (skip /)
896 * e                   e
897 *
898 * I.E. perform a breadth-first search of the tree.
899 *
900 * \param parent Root of (sub)tree to traverse
901 * \param prefix Prefix to match
902 * \param callback Callback function
903 * \return true to continue, false otherwise
904 */
905static bool
907 const char *prefix,
908 bool (*callback)(nsurl *url, const struct url_data *data))
909{
910 const struct path_data *p = parent->children;
911 const char *slash, *end = prefix + strlen(prefix);
912
913 do {
914 slash = strchr(prefix, '/');
915 if (!slash) {
916 slash = end;
917 }
918
919 if (slash == prefix && *prefix == '/') {
920 /* Ignore "//" */
921 prefix++;
922 continue;
923 }
924
925 if (strncasecmp(p->segment, prefix, slash - prefix) == 0) {
926 /* prefix matches so far */
927 if (slash == end) {
928 /* we've run out of prefix, so all
929 * paths below this one match */
931 callback,
932 NULL)) {
933 return false;
934 }
935
936 /* Progress to next sibling */
937 p = p->next;
938 } else {
939 /* Skip over this segment */
940 prefix = slash + 1;
941
942 p = p->children;
943 }
944 } else {
945 /* Doesn't match this segment, try next sibling */
946 p = p->next;
947 }
948 } while (p != NULL);
949
950 return true;
951}
952
953
954/**
955 * Host data iterator (internal)
956 *
957 * \param parent Root of subtree to iterate over
958 * \param url_callback Callback function
959 * \param cookie_callback Callback function
960 * \return true to continue, false otherwise
961 */
962static bool
964 bool (*url_callback)(nsurl *url, const struct url_data *data),
965 bool (*cookie_callback)(const struct cookie_data *data))
966{
967 if (parent == &empty) {
968 return true;
969 }
970
972 url_callback,
973 cookie_callback)) {
974 return false;
975 }
976
977 if ((parent->data->paths.children) ||
978 ((cookie_callback) &&
979 (parent->data->paths.cookies))) {
980 /* We have paths (or domain cookies), so iterate them */
981 if (!urldb_iterate_entries_path(&parent->data->paths,
982 url_callback,
983 cookie_callback)) {
984 return false;
985 }
986 }
987
989 url_callback,
990 cookie_callback)) {
991 return false;
992 }
993
994 return true;
995}
996
997
998/**
999 * Add a host node to the tree
1000 *
1001 * \param part Host segment to add (or whole IP address) (copied)
1002 * \param parent Parent node to add to
1003 * \return Pointer to added node, or NULL on memory exhaustion
1004 */
1005static struct host_part *
1007{
1008 struct host_part *d;
1009
1010 assert(part && parent);
1011
1012 d = calloc(1, sizeof(struct host_part));
1013 if (!d) {
1014 return NULL;
1015 }
1016
1017 d->part = strdup(part);
1018 if (!d->part) {
1019 free(d);
1020 return NULL;
1021 }
1022
1023 d->next = parent->children;
1024 if (parent->children) {
1025 parent->children->prev = d;
1026 }
1027 d->parent = parent;
1028 parent->children = d;
1029
1030 return d;
1031}
1032
1033
/**
 * Fragment comparator callback for qsort
 *
 * Each argument points at an element of an array of strings; the
 * strings referred to are compared without regard to case.
 *
 * \param a pointer to first array element
 * \param b pointer to second array element
 * \return 0 for equal else positive or negative value on comparison
 */
static int urldb_add_path_fragment_cmp(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}
1045
1046
1047/**
1048 * Add a fragment to a path segment
1049 *
1050 * \param segment Path segment to add to
1051 * \param fragment Fragment to add (copied), or NULL
1052 * \return segment or NULL on memory exhaustion
1053 */
1054static struct path_data *
1056{
1057 char **temp;
1058
1059 assert(segment);
1060
1061 /* If no fragment, this function is a NOP
1062 * This may seem strange, but it makes the rest
1063 * of the code cleaner */
1064 if (!fragment)
1065 return segment;
1066
1067 temp = realloc(segment->fragment,
1068 (segment->frag_cnt + 1) * sizeof(char *));
1069 if (!temp)
1070 return NULL;
1071
1072 segment->fragment = temp;
1073 segment->fragment[segment->frag_cnt] =
1074 strdup(lwc_string_data(fragment));
1075 if (!segment->fragment[segment->frag_cnt]) {
1076 /* Don't free temp - it's now our buffer */
1077 return NULL;
1078 }
1079
1080 segment->frag_cnt++;
1081
1082 /* We want fragments in alphabetical order, so sort them
1083 * It may prove better to insert in alphabetical order instead */
1084 qsort(segment->fragment,
1085 segment->frag_cnt,
1086 sizeof (char *),
1088
1089 return segment;
1090}
1091
1092
1093/**
1094 * Add a path node to the tree
1095 *
1096 * \param scheme URL scheme associated with path (copied)
1097 * \param port Port number on host associated with path
1098 * \param segment Path segment to add (copied)
1099 * \param fragment URL fragment (copied), or NULL
1100 * \param parent Parent node to add to
1101 * \return Pointer to added node, or NULL on memory exhaustion
1102 */
1103static struct path_data *
1105 unsigned int port,
1106 const char *segment,
1107 lwc_string *fragment,
1108 struct path_data *parent)
1109{
1110 struct path_data *d, *e;
1111
1112 assert(scheme && segment && parent);
1113
1114 d = calloc(1, sizeof(struct path_data));
1115 if (!d)
1116 return NULL;
1117
1118 d->scheme = lwc_string_ref(scheme);
1119
1120 d->port = port;
1121
1122 d->segment = strdup(segment);
1123 if (!d->segment) {
1124 lwc_string_unref(d->scheme);
1125 free(d);
1126 return NULL;
1127 }
1128
1129 if (fragment) {
1131 free(d->segment);
1132 lwc_string_unref(d->scheme);
1133 free(d);
1134 return NULL;
1135 }
1136 }
1137
1138 for (e = parent->children; e; e = e->next) {
1139 if (strcmp(e->segment, d->segment) > 0)
1140 break;
1141 }
1142
1143 if (e) {
1144 d->prev = e->prev;
1145 d->next = e;
1146 if (e->prev)
1147 e->prev->next = d;
1148 else
1149 parent->children = d;
1150 e->prev = d;
1151 } else if (!parent->children) {
1152 d->prev = d->next = NULL;
1153 parent->children = parent->last = d;
1154 } else {
1155 d->next = NULL;
1156 d->prev = parent->last;
1157 parent->last->next = d;
1158 parent->last = d;
1159 }
1160 d->parent = parent;
1161
1162 return d;
1163}
1164
1165
1166/**
1167 * Get the search tree for a particular host
1168 *
1169 * \param host the host to lookup
1170 * \return the corresponding search tree
1171 */
1172static struct search_node **urldb_get_search_tree_direct(const char *host)
1173{
1174 assert(host);
1175
1176 if (urldb__host_is_ip_address(host)) {
1177 return &search_trees[ST_IP];
1178 } else if (ascii_is_alpha(*host)) {
1179 return &search_trees[ST_DN + ascii_to_lower(*host) - 'a'];
1180 }
1181 return &search_trees[ST_EE];
1182}
1183
1184
/**
 * Get the search tree for a particular host
 *
 * Reads the tree root out of the slot chosen by
 * urldb_get_search_tree_direct().
 *
 * \param host the host to lookup
 * \return the corresponding search tree
 */
static struct search_node *urldb_get_search_tree(const char *host)
{
	struct search_node **slot = urldb_get_search_tree_direct(host);

	return *slot;
}
1195
1196
1197/**
1198 * Compare host part with a string
1199 *
1200 * \param a host part
1201 * \param b string to compare
1202 * \return 0 if match, non-zero, otherwise
1203 */
1204static int urldb_search_match_string(const struct host_part *a, const char *b)
1205{
1206 const char *end, *dot;
1207 int plen, ret;
1208
1209 assert(a && a != &db_root && b);
1210
1212 /* IP address */
1213 return strcasecmp(a->part, b);
1214 }
1215
1216 end = b + strlen(b) + 1;
1217
1218 while (b < end && a && a != &db_root) {
1219 dot = strchr(b, '.');
1220 if (!dot) {
1221 /* last segment */
1222 dot = end - 1;
1223 }
1224
1225 /* Compare strings (length limited) */
1226 if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
1227 /* didn't match => return difference */
1228 return ret;
1229
1230 /* The strings matched, now check that the lengths do, too */
1231 plen = strlen(a->part);
1232
1233 if (plen > dot - b) {
1234 /* len(a) > len(b) */
1235 return 1;
1236 } else if (plen < dot - b) {
1237 /* len(a) < len(b) */
1238 return -1;
1239 }
1240
1241 b = dot + 1;
1242 a = a->parent;
1243 }
1244
1245 /* If we get here then either:
1246 * a) The path lengths differ
1247 * or b) The hosts are identical
1248 */
1249 if (a && a != &db_root && b >= end) {
1250 /* len(a) > len(b) */
1251 return 1;
1252 } else if ((!a || a == &db_root) && b < end) {
1253 /* len(a) < len(b) */
1254 return -1;
1255 }
1256
1257 /* Identical */
1258 return 0;
1259}
1260
1261
1262/**
1263 * Find a node in a search tree
1264 *
1265 * \param root Tree to look in
1266 * \param host Host to find
1267 * \return Pointer to host tree node, or NULL if not found
1268 */
1269static const struct host_part *
1270urldb_search_find(struct search_node *root, const char *host)
1271{
1272 int c;
1273
1274 assert(root && host);
1275
1276 if (root == &empty) {
1277 return NULL;
1278 }
1279
1280 c = urldb_search_match_string(root->data, host);
1281
1282 if (c > 0) {
1283 return urldb_search_find(root->left, host);
1284 } else if (c < 0) {
1285 return urldb_search_find(root->right, host);
1286 }
1287
1288 return root->data;
1289}
1290
1291
1292/**
1293 * Match a path string
1294 *
1295 * \param parent Path (sub)tree to look in
1296 * \param path The path to search for
1297 * \param scheme The URL scheme associated with the path
1298 * \param port The port associated with the path
1299 * \return Pointer to path data or NULL if not found.
1300 */
1301static struct path_data *
1303 const char *path,
1304 lwc_string *scheme,
1305 unsigned short port)
1306{
1307 const struct path_data *p;
1308 const char *slash;
1309 bool match;
1310
1311 assert(parent != NULL);
1312 assert(parent->segment == NULL);
1313
1314 if (path[0] != '/') {
1315 NSLOG(netsurf, INFO, "path is %s", path);
1316 }
1317
1318 assert(path[0] == '/');
1319
1320 /* Start with children, as parent has no segment */
1321 p = parent->children;
1322
1323 while (p != NULL) {
1324 slash = strchr(path + 1, '/');
1325 if (!slash) {
1326 slash = path + strlen(path);
1327 }
1328
1329 if (strncmp(p->segment, path + 1, slash - path - 1) == 0 &&
1330 lwc_string_isequal(p->scheme, scheme, &match) == lwc_error_ok &&
1331 match == true &&
1332 p->port == port) {
1333 if (*slash == '\0') {
1334 /* Complete match */
1335 return (struct path_data *) p;
1336 }
1337
1338 /* Match so far, go down tree */
1339 p = p->children;
1340
1341 path = slash;
1342 } else {
1343 /* No match, try next sibling */
1344 p = p->next;
1345 }
1346 }
1347
1348 return NULL;
1349}
1350
1351
1352/**
1353 * Find an URL in the database
1354 *
1355 * \param url Absolute URL to find
1356 * \return Pointer to path data, or NULL if not found
1357 */
1359{
1360 const struct host_part *h;
1361 struct path_data *p;
1362 struct search_node *tree;
1363 char *plq;
1364 const char *host_str;
1365 lwc_string *scheme, *host, *port;
1366 size_t len = 0;
1367 unsigned int port_int;
1368 bool match;
1369
1370 assert(url);
1371
1372 if (url_bloom != NULL) {
1373 if (bloom_search_hash(url_bloom, nsurl_hash(url)) == false) {
1374 return NULL;
1375 }
1376 }
1377
1378 scheme = nsurl_get_component(url, NSURL_SCHEME);
1379 if (scheme == NULL)
1380 return NULL;
1381
1382 if (lwc_string_isequal(scheme, corestring_lwc_mailto, &match) ==
1383 lwc_error_ok && match == true) {
1384 lwc_string_unref(scheme);
1385 return NULL;
1386 }
1387
1388 host = nsurl_get_component(url, NSURL_HOST);
1389 if (host != NULL) {
1390 host_str = lwc_string_data(host);
1391 lwc_string_unref(host);
1392
1393 } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
1394 lwc_error_ok && match == true) {
1395 host_str = "localhost";
1396
1397 } else {
1398 lwc_string_unref(scheme);
1399 return NULL;
1400 }
1401
1402 tree = urldb_get_search_tree(host_str);
1403 h = urldb_search_find(tree, host_str);
1404 if (!h) {
1405 lwc_string_unref(scheme);
1406 return NULL;
1407 }
1408
1409 /* generate plq (path, leaf, query) */
1410 if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &plq, &len) != NSERROR_OK) {
1411 lwc_string_unref(scheme);
1412 return NULL;
1413 }
1414
1415 /* Get port */
1416 port = nsurl_get_component(url, NSURL_PORT);
1417 if (port != NULL) {
1418 port_int = atoi(lwc_string_data(port));
1419 lwc_string_unref(port);
1420 } else {
1421 port_int = 0;
1422 }
1423
1424 p = urldb_match_path(&h->paths, plq, scheme, port_int);
1425
1426 free(plq);
1427 lwc_string_unref(scheme);
1428
1429 return p;
1430}
1431
1432
1433/**
1434 * Dump URL database paths to stderr
1435 *
1436 * \param parent Parent node of tree to dump
1437 */
1439{
1440 const struct path_data *p = parent;
1441 unsigned int i;
1442
1443 do {
1444 if (p->segment != NULL) {
1445 NSLOG(netsurf, INFO, "\t%s : %u",
1446 lwc_string_data(p->scheme), p->port);
1447
1448 NSLOG(netsurf, INFO, "\t\t'%s'", p->segment);
1449
1450 for (i = 0; i != p->frag_cnt; i++) {
1451 NSLOG(netsurf, INFO, "\t\t\t#%s",
1452 p->fragment[i]);
1453 }
1454 }
1455
1456 if (p->children != NULL) {
1457 p = p->children;
1458 } else {
1459 while (p != parent) {
1460 if (p->next != NULL) {
1461 p = p->next;
1462 break;
1463 }
1464
1465 p = p->parent;
1466 }
1467 }
1468 } while (p != parent);
1469}
1470
1471
1472/**
1473 * Dump URL database hosts to stderr
1474 *
1475 * \param parent Parent node of tree to dump
1476 */
1478{
1479 struct host_part *h;
1480
1481 if (parent->part) {
1482 NSLOG(netsurf, INFO, "%s", parent->part);
1483
1484 NSLOG(netsurf, INFO, "\t%s invalid SSL certs",
1485 parent->permit_invalid_certs ? "Permits" : "Denies");
1486 }
1487
1488 /* Dump path data */
1489 urldb_dump_paths(&parent->paths);
1490
1491 /* and recurse */
1492 for (h = parent->children; h; h = h->next) {
1494 }
1495}
1496
1497
1498/**
1499 * Dump search tree
1500 *
1501 * \param parent Parent node of tree to dump
1502 * \param depth Tree depth
1503 */
1504static void urldb_dump_search(struct search_node *parent, int depth)
1505{
1506 const struct host_part *h;
1507 int i; /* index into string */
1508 char s[1024];
1509 int r;
1510 int sl = sizeof(s) - 2;
1511
1512 if (parent == &empty)
1513 return;
1514
1515 urldb_dump_search(parent->left, depth + 1);
1516
1517 for (i = 0; i != depth; i++) {
1518 s[i] = ' ';
1519 }
1520
1521 for (h = parent->data; h; h = h->parent) {
1522 if (h->part) {
1523 r = snprintf(&s[i], sl - i, "%s", h->part);
1524 if (r < 0) {
1525 break;
1526 }
1527 if ((i + r) >= sl) {
1528 break;
1529 }
1530 i += r;
1531 }
1532
1533 if (h->parent && h->parent->parent) {
1534 s[i]='.';
1535 i++;
1536 }
1537 }
1538 s[i]= 0;
1539
1540 NSLOG(netsurf, INFO, "%s", s);
1541
1542 urldb_dump_search(parent->right, depth + 1);
1543}
1544
1545
1546/**
1547 * Compare a pair of host parts
1548 *
1549 * \param a first host part
1550 * \param b second host part
1551 * \return 0 if match, non-zero, otherwise
1552 */
1553static int
1554urldb_search_match_host(const struct host_part *a, const struct host_part *b)
1555{
1556 int ret;
1557
1558 assert(a && b);
1559
1560 /* traverse up tree to root, comparing parts as we go. */
1561 for (; a && a != &db_root && b && b != &db_root;
1562 a = a->parent, b = b->parent) {
1563 if ((ret = strcasecmp(a->part, b->part)) != 0) {
1564 /* They differ => return the difference here */
1565 return ret;
1566 }
1567 }
1568
1569 /* If we get here then either:
1570 * a) The path lengths differ
1571 * or b) The hosts are identical
1572 */
1573 if (a && a != &db_root && (!b || b == &db_root)) {
1574 /* len(a) > len(b) */
1575 return 1;
1576 } else if ((!a || a == &db_root) && b && b != &db_root) {
1577 /* len(a) < len(b) */
1578 return -1;
1579 }
1580
1581 /* identical */
1582 return 0;
1583}
1584
1585
1586/**
1587 * Rotate a subtree right
1588 *
1589 * \param root Root of subtree to rotate
1590 * \return new root of subtree
1591 */
1593{
1594 assert(root);
1595
1596 if (root->left->level == root->level) {
1597 struct search_node *temp;
1598
1599 temp = root->left;
1600 root->left = temp->right;
1601 temp->right = root;
1602 root = temp;
1603 }
1604
1605 return root;
1606}
1607
1608
1609/**
1610 * Rotate a node left, increasing the parent's level
1611 *
1612 * \param root Root of subtree to rotate
1613 * \return New root of subtree
1614 */
1616{
1617 assert(root);
1618
1619 if (root->right->right->level == root->level) {
1620 struct search_node *temp;
1621
1622 temp = root->right;
1623 root->right = temp->left;
1624 temp->left = root;
1625 root = temp;
1626
1627 root->level++;
1628 }
1629
1630 return root;
1631}
1632
1633
1634/**
1635 * Insert node into search tree
1636 *
1637 * \param root Root of (sub)tree to insert into
1638 * \param n Node to insert
1639 * \return Pointer to updated root
1640 */
1641static struct search_node *
1643{
1644 assert(root && n);
1645
1646 if (root == &empty) {
1647 root = n;
1648 } else {
1649 int c = urldb_search_match_host(root->data, n->data);
1650
1651 if (c > 0) {
1653 root->left, n);
1654 } else if (c < 0) {
1656 root->right, n);
1657 } else {
1658 /* exact match */
1659 free(n);
1660 return root;
1661 }
1662
1665 }
1666
1667 return root;
1668}
1669
1670
1671/**
1672 * Insert a node into the search tree
1673 *
1674 * \param root Root of tree to insert into
1675 * \param data User data to insert
1676 * \return Pointer to updated root, or NULL if failed
1677 */
1678static struct search_node *
1680{
1681 struct search_node *n;
1682
1683 assert(root && data);
1684
1685 n = malloc(sizeof(struct search_node));
1686 if (!n)
1687 return NULL;
1688
1689 n->level = 1;
1690 n->data = data;
1691 n->left = n->right = &empty;
1692
1694
1695 return root;
1696}
1697
1698
1699/**
1700 * Parse a cookie avpair
1701 *
1702 * \param c Cookie struct to populate
1703 * \param n Name component
1704 * \param v Value component
1705 * \param was_quoted Whether \a v was quoted in the input
1706 * \return true on success, false on memory exhaustion
1707 */
1708static bool
1710 char *n,
1711 char *v,
1712 bool was_quoted)
1713{
1714 int vlen;
1715
1716 assert(c && n && v);
1717
1718 /* Strip whitespace from start of name */
1719 for (; *n; n++) {
1720 if (*n != ' ' && *n != '\t')
1721 break;
1722 }
1723
1724 /* Strip whitespace from end of name */
1725 for (vlen = strlen(n); vlen; vlen--) {
1726 if (n[vlen] == ' ' || n[vlen] == '\t')
1727 n[vlen] = '\0';
1728 else
1729 break;
1730 }
1731
1732 /* Strip whitespace from start of value */
1733 for (; *v; v++) {
1734 if (*v != ' ' && *v != '\t')
1735 break;
1736 }
1737
1738 /* Strip whitespace from end of value */
1739 for (vlen = strlen(v); vlen; vlen--) {
1740 if (v[vlen] == ' ' || v[vlen] == '\t')
1741 v[vlen] = '\0';
1742 else
1743 break;
1744 }
1745
1746 if (!c->comment && strcasecmp(n, "Comment") == 0) {
1747 c->comment = strdup(v);
1748 if (!c->comment)
1749 return false;
1750 } else if (!c->domain && strcasecmp(n, "Domain") == 0) {
1751 if (v[0] == '.') {
1752 /* Domain must start with a dot */
1753 c->domain_from_set = true;
1754 c->domain = strdup(v);
1755 if (!c->domain)
1756 return false;
1757 }
1758 } else if (strcasecmp(n, "Max-Age") == 0) {
1759 int temp = atoi(v);
1760 if (temp == 0)
1761 /* Special case - 0 means delete */
1762 c->expires = 0;
1763 else
1764 c->expires = time(NULL) + temp;
1765 } else if (!c->path && strcasecmp(n, "Path") == 0) {
1766 c->path_from_set = true;
1767 c->path = strdup(v);
1768 if (!c->path)
1769 return false;
1770 } else if (strcasecmp(n, "Version") == 0) {
1771 c->version = atoi(v);
1772 } else if (strcasecmp(n, "Expires") == 0) {
1773 char *datenoday;
1774 time_t expires;
1775 nserror res;
1776
1777 /* Strip dayname from date (these are hugely variable
1778 * and liable to break the parser. They also serve no
1779 * useful purpose) */
1780 for (datenoday = v;
1781 *datenoday && !ascii_is_digit(*datenoday);
1782 datenoday++) {
1783 /* do nothing */
1784 }
1785
1786 res = nsc_strntimet(datenoday, strlen(datenoday), &expires);
1787 if (res != NSERROR_OK) {
1788 /* assume we have an unrepresentable date =>
1789 * force it to the maximum possible value of a
1790 * 32bit time_t (this may break in 2038. We'll
1791 * deal with that once we come to it) */
1792 expires = (time_t)0x7fffffff;
1793 }
1794 c->expires = expires;
1795 } else if (strcasecmp(n, "Secure") == 0) {
1796 c->secure = true;
1797 } else if (strcasecmp(n, "HttpOnly") == 0) {
1798 c->http_only = true;
1799 } else if (!c->name) {
1800 c->name = strdup(n);
1801 c->value = strdup(v);
1802 c->value_was_quoted = was_quoted;
1803 if (!c->name || !c->value) {
1804 return false;
1805 }
1806 }
1807
1808 return true;
1809}
1810
1811
1812/**
1813 * Free a cookie
1814 *
1815 * \param c The cookie to free
1816 */
1818{
1819 assert(c);
1820
1821 free(c->comment);
1822 free(c->domain);
1823 free(c->path);
1824 free(c->name);
1825 free(c->value);
1826 free(c);
1827}
1828
1829
1830/**
1831 * Parse a cookie
1832 *
1833 * \param url URL being fetched
1834 * \param cookie Pointer to cookie string (updated on exit)
1835 * \return Pointer to cookie structure (on heap, caller frees) or NULL
1836 */
1837static struct cookie_internal_data *
1838urldb_parse_cookie(nsurl *url, const char **cookie)
1839{
1840 struct cookie_internal_data *c;
1841 const char *cur;
1842 char name[1024], value[4096];
1843 char *n = name, *v = value;
1844 bool in_value = false;
1845 bool had_value_data = false;
1846 bool value_verbatim = false;
1847 bool quoted = false;
1848 bool was_quoted = false;
1849
1850 assert(url && cookie && *cookie);
1851
1852 c = calloc(1, sizeof(struct cookie_internal_data));
1853 if (c == NULL)
1854 return NULL;
1855
1856 c->expires = -1;
1857
1858 name[0] = '\0';
1859 value[0] = '\0';
1860
1861 for (cur = *cookie; *cur; cur++) {
1862 if (*cur == '\r' && *(cur + 1) == '\n') {
1863 /* End of header */
1864 if (quoted) {
1865 /* Unmatched quote encountered */
1866
1867 /* Match Firefox 2.0.0.11 */
1868 value[0] = '\0';
1869
1870 }
1871
1872 break;
1873 } else if (*cur == '\r') {
1874 /* Spurious linefeed */
1875 continue;
1876 } else if (*cur == '\n') {
1877 /* Spurious newline */
1878 continue;
1879 }
1880
1881 if (in_value && !had_value_data) {
1882 if (*cur == ' ' || *cur == '\t') {
1883 /* Strip leading whitespace from value */
1884 continue;
1885 } else {
1886 had_value_data = true;
1887
1888 /* Value is taken verbatim if first non-space
1889 * character is not a " */
1890 if (*cur != '"') {
1891 value_verbatim = true;
1892 }
1893 }
1894 }
1895
1896 if (in_value && !value_verbatim && (*cur == '"')) {
1897 /* Only non-verbatim values may be quoted */
1898 if (cur == *cookie || *(cur - 1) != '\\') {
1899 /* Only unescaped quotes count */
1900 was_quoted = quoted;
1901 quoted = !quoted;
1902
1903 continue;
1904 }
1905 }
1906
1907 if (!quoted && !in_value && *cur == '=') {
1908 /* First equals => attr-value separator */
1909 in_value = true;
1910 continue;
1911 }
1912
1913 if (!quoted && (was_quoted || *cur == ';')) {
1914 /* Semicolon or after quoted value
1915 * => end of current avpair */
1916
1917 /* NUL-terminate tokens */
1918 *n = '\0';
1919 *v = '\0';
1920
1921 if (!urldb_parse_avpair(c, name, value, was_quoted)) {
1922 /* Memory exhausted */
1924 return NULL;
1925 }
1926
1927 /* And reset to start */
1928 n = name;
1929 v = value;
1930 in_value = false;
1931 had_value_data = false;
1932 value_verbatim = false;
1933 was_quoted = false;
1934
1935 /* Now, if the current input is anything other than a
1936 * semicolon, we must be sure to reprocess it */
1937 if (*cur != ';') {
1938 cur--;
1939 }
1940
1941 continue;
1942 }
1943
1944 /* And now handle commas. These are a pain as they may mean
1945 * any of the following:
1946 *
1947 * + End of cookie
1948 * + Day separator in Expires avpair
1949 * + (Invalid) comma in unquoted value
1950 *
1951 * Therefore, in order to handle all 3 cases (2 and 3 are
1952 * identical, the difference being that 2 is in the spec and
1953 * 3 isn't), we need to determine where the comma actually
1954 * lies. We use the following heuristic:
1955 *
1956 * Given a comma at the current input position, find the
1957 * immediately following semicolon (or end of input if none
1958 * found). Then, consider the input characters between
1959 * these two positions. If any of these characters is an
1960 * '=', we must assume that the comma signified the end of
1961 * the current cookie.
1962 *
1963 * This holds as the first avpair of any cookie must be
1964 * NAME=VALUE, so the '=' is guaranteed to appear in the
1965 * case where the comma marks the end of a cookie.
1966 *
1967 * This will fail, however, in the case where '=' appears in
1968 * the value of the current avpair after the comma or the
1969 * subsequent cookie does not start with NAME=VALUE. Neither
1970 * of these is particularly likely and if they do occur, the
1971 * website is more broken than we can be bothered to handle.
1972 */
1973 if (!quoted && *cur == ',') {
1974 /* Find semi-colon, if any */
1975 const char *p;
1976 const char *semi = strchr(cur + 1, ';');
1977 if (!semi)
1978 semi = cur + strlen(cur) - 2 /* CRLF */;
1979
1980 /* Look for equals sign between comma and semi */
1981 for (p = cur + 1; p < semi; p++)
1982 if (*p == '=')
1983 break;
1984
1985 if (p == semi) {
1986 /* none found => comma internal to value */
1987 /* do nothing */
1988 } else {
1989 /* found one => comma marks end of cookie */
1990 cur++;
1991 break;
1992 }
1993 }
1994
1995 /* Accumulate into buffers, always leaving space for a NUL */
1996 /** \todo is silently truncating overlong names/values wise? */
1997 if (!in_value) {
1998 if (n < name + (sizeof(name) - 1))
1999 *n++ = *cur;
2000 } else {
2001 if (v < value + (sizeof(value) - 1))
2002 *v++ = *cur;
2003 }
2004 }
2005
2006 /* Parse final avpair */
2007 *n = '\0';
2008 *v = '\0';
2009
2010 if (!urldb_parse_avpair(c, name, value, was_quoted)) {
2011 /* Memory exhausted */
2013 return NULL;
2014 }
2015
2016 /* Now fix-up default values */
2017 if (c->domain == NULL) {
2018 lwc_string *host = nsurl_get_component(url, NSURL_HOST);
2019 if (host == NULL) {
2021 return NULL;
2022 }
2023 c->domain = strdup(lwc_string_data(host));
2024 lwc_string_unref(host);
2025 }
2026
2027 if (c->path == NULL) {
2028 const char *path_data;
2029 char *path, *slash;
2030 lwc_string *path_lwc;
2031
2032 path_lwc = nsurl_get_component(url, NSURL_PATH);
2033 if (path_lwc == NULL) {
2035 return NULL;
2036 }
2037 path_data = lwc_string_data(path_lwc);
2038
2039 /* Strip leafname and trailing slash (4.3.1) */
2040 slash = strrchr(path_data, '/');
2041 if (slash != NULL) {
2042 /* Special case: retain first slash in path */
2043 if (slash == path_data)
2044 slash++;
2045
2046 slash = strndup(path_data, slash - path_data);
2047 if (slash == NULL) {
2048 lwc_string_unref(path_lwc);
2050 return NULL;
2051 }
2052
2053 path = slash;
2054 lwc_string_unref(path_lwc);
2055 } else {
2056 path = strdup(lwc_string_data(path_lwc));
2057 lwc_string_unref(path_lwc);
2058 if (path == NULL) {
2060 return NULL;
2061 }
2062 }
2063
2064 c->path = path;
2065 }
2066
2067 /* Write back current position */
2068 *cookie = cur;
2069
2070 return c;
2071}
2072
2073
2074/**
2075 * Add a path to the database, creating any intermediate entries
2076 *
2077 * \param scheme URL scheme associated with path
2078 * \param port Port number on host associated with path
2079 * \param host Host tree node to attach to
2080 * \param path_query Absolute path plus query to add (freed)
2081 * \param fragment URL fragment, or NULL
2082 * \param url URL (fragment ignored)
2083 * \return Pointer to leaf node, or NULL on memory exhaustion
2084 */
2085static struct path_data *
2087 unsigned int port,
2088 const struct host_part *host,
2089 char *path_query,
2090 lwc_string *fragment,
2091 nsurl *url)
2092{
2093 struct path_data *d, *e;
2094 char *buf = path_query;
2095 char *segment, *slash;
2096 bool match;
2097
2098 assert(scheme && host && url);
2099
2100 d = (struct path_data *) &host->paths;
2101
2102 /* skip leading '/' */
2103 segment = buf;
2104 if (*segment == '/')
2105 segment++;
2106
2107 /* Process path segments */
2108 do {
2109 slash = strchr(segment, '/');
2110 if (!slash) {
2111 /* last segment */
2112 /* look for existing entry */
2113 for (e = d->children; e; e = e->next)
2114 if (strcmp(segment, e->segment) == 0 &&
2115 lwc_string_isequal(scheme,
2116 e->scheme, &match) ==
2117 lwc_error_ok &&
2118 match == true &&
2119 e->port == port)
2120 break;
2121
2122 d = e ? urldb_add_path_fragment(e, fragment) :
2124 segment, fragment, d);
2125 break;
2126 }
2127
2128 *slash = '\0';
2129
2130 /* look for existing entry */
2131 for (e = d->children; e; e = e->next)
2132 if (strcmp(segment, e->segment) == 0 &&
2133 lwc_string_isequal(scheme, e->scheme,
2134 &match) == lwc_error_ok &&
2135 match == true &&
2136 e->port == port)
2137 break;
2138
2139 d = e ? e : urldb_add_path_node(scheme, port, segment, NULL, d);
2140 if (!d)
2141 break;
2142
2143 segment = slash + 1;
2144 } while (1);
2145
2146 free(path_query);
2147
2148 if (d && !d->url) {
2149 /* Insert defragmented URL */
2150 if (nsurl_defragment(url, &d->url) != NSERROR_OK)
2151 return NULL;
2152 }
2153
2154 return d;
2155}
2156
2157
2158/**
2159 * Add a host to the database, creating any intermediate entries
2160 *
2161 * \param host Hostname to add
2162 * \return Pointer to leaf node, or NULL on memory exhaustion
2163 */
2164static struct host_part *urldb_add_host(const char *host)
2165{
2166 struct host_part *d = (struct host_part *) &db_root, *e;
2167 struct search_node *s;
2168 char buf[256]; /* 256 bytes is sufficient - domain names are
2169 * limited to 255 chars. */
2170 char *part;
2171
2172 assert(host);
2173
2174 if (urldb__host_is_ip_address(host)) {
2175 /* Host is an IP, so simply add as TLD */
2176
2177 /* Check for existing entry */
2178 for (e = d->children; e; e = e->next)
2179 if (strcasecmp(host, e->part) == 0)
2180 /* found => return it */
2181 return e;
2182
2183 d = urldb_add_host_node(host, d);
2184
2186 if (!s) {
2187 /* failed */
2188 d = NULL;
2189 } else {
2190 search_trees[ST_IP] = s;
2191 }
2192
2193 return d;
2194 }
2195
2196 /* Copy host string, so we can corrupt it */
2197 strncpy(buf, host, sizeof buf);
2198 buf[sizeof buf - 1] = '\0';
2199
2200 /* Process FQDN segments backwards */
2201 do {
2202 part = strrchr(buf, '.');
2203 if (!part) {
2204 /* last segment */
2205 /* Check for existing entry */
2206 for (e = d->children; e; e = e->next)
2207 if (strcasecmp(buf, e->part) == 0)
2208 break;
2209
2210 if (e) {
2211 d = e;
2212 } else {
2213 d = urldb_add_host_node(buf, d);
2214 }
2215
2216 /* And insert into search tree */
2217 if (d) {
2218 struct search_node **r;
2219
2221 s = urldb_search_insert(*r, d);
2222 if (!s) {
2223 /* failed */
2224 d = NULL;
2225 } else {
2226 *r = s;
2227 }
2228 }
2229 break;
2230 }
2231
2232 /* Check for existing entry */
2233 for (e = d->children; e; e = e->next)
2234 if (strcasecmp(part + 1, e->part) == 0)
2235 break;
2236
2237 d = e ? e : urldb_add_host_node(part + 1, d);
2238 if (!d)
2239 break;
2240
2241 *part = '\0';
2242 } while (1);
2243
2244 return d;
2245}
2246
2247
2248/**
2249 * Insert a cookie into the database
2250 *
2251 * \param c The cookie to insert
2252 * \param scheme URL scheme associated with cookie path
2253 * \param url URL (sans fragment) associated with cookie
2254 * \return true on success, false on memory exhaustion (c will be freed)
2255 */
2256static bool
2258 lwc_string *scheme,
2259 nsurl *url)
2260{
2261 struct cookie_internal_data *d;
2262 const struct host_part *h;
2263 struct path_data *p;
2264 time_t now = time(NULL);
2265
2266 assert(c);
2267
2268 if (c->domain[0] == '.') {
2270 urldb_get_search_tree(&(c->domain[1])),
2271 c->domain + 1);
2272 if (!h) {
2273 h = urldb_add_host(c->domain + 1);
2274 if (!h) {
2276 return false;
2277 }
2278 }
2279
2280 p = (struct path_data *) &h->paths;
2281 } else {
2282 /* Need to have a URL and scheme, if it's not a domain cookie */
2283 assert(url != NULL);
2284 assert(scheme != NULL);
2285
2288 c->domain);
2289
2290 if (!h) {
2291 h = urldb_add_host(c->domain);
2292 if (!h) {
2294 return false;
2295 }
2296 }
2297
2298 /* find path */
2299 p = urldb_add_path(scheme, 0, h,
2300 strdup(c->path), NULL, url);
2301 if (!p) {
2303 return false;
2304 }
2305 }
2306
2307 /* add cookie */
2308 for (d = p->cookies; d; d = d->next) {
2309 if (!strcmp(d->domain, c->domain) &&
2310 !strcmp(d->path, c->path) &&
2311 !strcmp(d->name, c->name))
2312 break;
2313 }
2314
2315 if (d) {
2316 if (c->expires != -1 && c->expires < now) {
2317 /* remove cookie */
2318 if (d->next)
2319 d->next->prev = d->prev;
2320 else
2321 p->cookies_end = d->prev;
2322 if (d->prev)
2323 d->prev->next = d->next;
2324 else
2325 p->cookies = d->next;
2326
2328
2331 } else {
2332 /* replace d with c */
2333 c->prev = d->prev;
2334 c->next = d->next;
2335 if (c->next)
2336 c->next->prev = c;
2337 else
2338 p->cookies_end = c;
2339 if (c->prev)
2340 c->prev->next = c;
2341 else
2342 p->cookies = c;
2343
2346
2347 cookie_manager_add((struct cookie_data *)c);
2348 }
2349 } else {
2350 c->prev = p->cookies_end;
2351 c->next = NULL;
2352 if (p->cookies_end)
2353 p->cookies_end->next = c;
2354 else
2355 p->cookies = c;
2356 p->cookies_end = c;
2357
2358 cookie_manager_add((struct cookie_data *)c);
2359 }
2360
2361 return true;
2362}
2363
2364
2365/**
2366 * Concatenate a cookie into the provided buffer
2367 *
2368 * \param c Cookie to concatenate
2369 * \param version The version of the cookie string to output
2370 * \param used Pointer to amount of buffer used (updated)
2371 * \param alloc Pointer to allocated size of buffer (updated)
2372 * \param buf Pointer to Pointer to buffer (updated)
2373 * \return true on success, false on memory exhaustion
2374 */
2375static bool
2377 int version,
2378 int *used,
2379 int *alloc,
2380 char **buf)
2381{
2382 /* Combined (A)BNF for the Cookie: request header:
2383 *
2384 * CHAR = <any US-ASCII character (octets 0 - 127)>
2385 * CTL = <any US-ASCII control character
2386 * (octets 0 - 31) and DEL (127)>
2387 * CR = <US-ASCII CR, carriage return (13)>
2388 * LF = <US-ASCII LF, linefeed (10)>
2389 * SP = <US-ASCII SP, space (32)>
2390 * HT = <US-ASCII HT, horizontal-tab (9)>
2391 * <"> = <US-ASCII double-quote mark (34)>
2392 *
2393 * CRLF = CR LF
2394 *
2395 * LWS = [CRLF] 1*( SP | HT )
2396 *
2397 * TEXT = <any OCTET except CTLs,
2398 * but including LWS>
2399 *
2400 * token = 1*<any CHAR except CTLs or separators>
2401 * separators = "(" | ")" | "<" | ">" | "@"
2402 * | "," | ";" | ":" | "\" | <">
2403 * | "/" | "[" | "]" | "?" | "="
2404 * | "{" | "}" | SP | HT
2405 *
2406 * quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
2407 * qdtext = <any TEXT except <">>
2408 * quoted-pair = "\" CHAR
2409 *
2410 * attr = token
2411 * value = word
2412 * word = token | quoted-string
2413 *
2414 * cookie = "Cookie:" cookie-version
2415 * 1*((";" | ",") cookie-value)
2416 * cookie-value = NAME "=" VALUE [";" path] [";" domain]
2417 * cookie-version = "$Version" "=" value
2418 * NAME = attr
2419 * VALUE = value
2420 * path = "$Path" "=" value
2421 * domain = "$Domain" "=" value
2422 *
2423 * A note on quoted-string handling:
2424 * The cookie data stored in the db is verbatim (i.e. sans enclosing
2425 * <">, if any, and with all quoted-pairs intact) thus all that we
2426 * need to do here is ensure that value strings which were quoted
2427 * in Set-Cookie or which include any of the separators are quoted
2428 * before use.
2429 *
2430 * A note on cookie-value separation:
2431 * We use semicolons for all separators, including between
2432 * cookie-values. This simplifies things and is backwards compatible.
2433 */
2434 const char * const separators = "()<>@,;:\\\"/[]?={} \t";
2435
2436 int max_len;
2437
2438 assert(c && used && alloc && buf && *buf);
2439
2440 /* "; " cookie-value
2441 * We allow for the possibility that values are quoted
2442 */
2443 max_len = 2 + strlen(c->name) + 1 + strlen(c->value) + 2 +
2444 (c->path_from_set ?
2445 8 + strlen(c->path) + 2 : 0) +
2446 (c->domain_from_set ?
2447 10 + strlen(c->domain) + 2 : 0);
2448
2449 if (*used + max_len >= *alloc) {
2450 char *temp = realloc(*buf, *alloc + 4096);
2451 if (!temp) {
2452 return false;
2453 }
2454 *buf = temp;
2455 *alloc += 4096;
2456 }
2457
2458 if (version == COOKIE_NETSCAPE) {
2459 /* Original Netscape cookie */
2460 sprintf(*buf + *used - 1, "; %s=", c->name);
2461 *used += 2 + strlen(c->name) + 1;
2462
2463 /* The Netscape spec doesn't mention quoting of cookie values.
2464 * RFC 2109 $10.1.3 indicates that values must not be quoted.
2465 *
2466 * However, other browsers preserve quoting, so we should, too
2467 */
2468 if (c->value_was_quoted) {
2469 sprintf(*buf + *used - 1, "\"%s\"", c->value);
2470 *used += 1 + strlen(c->value) + 1;
2471 } else {
2472 /** \todo should we %XX-encode [;HT,SP] ? */
2473 /** \todo Should we strip escaping backslashes? */
2474 sprintf(*buf + *used - 1, "%s", c->value);
2475 *used += strlen(c->value);
2476 }
2477
2478 /* We don't send path/domain information -- that's what the
2479 * Netscape spec suggests we should do, anyway. */
2480 } else {
2481 /* RFC2109 or RFC2965 cookie */
2482 sprintf(*buf + *used - 1, "; %s=", c->name);
2483 *used += 2 + strlen(c->name) + 1;
2484
2485 /* Value needs quoting if it contains any separator or if
2486 * it needs preserving from the Set-Cookie header */
2487 if (c->value_was_quoted ||
2488 strpbrk(c->value, separators) != NULL) {
2489 sprintf(*buf + *used - 1, "\"%s\"", c->value);
2490 *used += 1 + strlen(c->value) + 1;
2491 } else {
2492 sprintf(*buf + *used - 1, "%s", c->value);
2493 *used += strlen(c->value);
2494 }
2495
2496 if (c->path_from_set) {
2497 /* Path, quoted if necessary */
2498 sprintf(*buf + *used - 1, "; $Path=");
2499 *used += 8;
2500
2501 if (strpbrk(c->path, separators) != NULL) {
2502 sprintf(*buf + *used - 1, "\"%s\"", c->path);
2503 *used += 1 + strlen(c->path) + 1;
2504 } else {
2505 sprintf(*buf + *used - 1, "%s", c->path);
2506 *used += strlen(c->path);
2507 }
2508 }
2509
2510 if (c->domain_from_set) {
2511 /* Domain, quoted if necessary */
2512 sprintf(*buf + *used - 1, "; $Domain=");
2513 *used += 10;
2514
2515 if (strpbrk(c->domain, separators) != NULL) {
2516 sprintf(*buf + *used - 1, "\"%s\"", c->domain);
2517 *used += 1 + strlen(c->domain) + 1;
2518 } else {
2519 sprintf(*buf + *used - 1, "%s", c->domain);
2520 *used += strlen(c->domain);
2521 }
2522 }
2523 }
2524
2525 return true;
2526}
2527
2528
2529/**
2530 * deletes paths from a cookie.
2531 *
2532 * \param domain the cookie domain
2533 * \param path the cookie path
2534 * \param name The cookie name
2535 * \param parent The url data of the cookie
2536 */
2537static void
2538urldb_delete_cookie_paths(const char *domain,
2539 const char *path,
2540 const char *name,
2541 struct path_data *parent)
2542{
2543 struct cookie_internal_data *c;
2544 struct path_data *p = parent;
2545
2546 assert(parent);
2547
2548 do {
2549 for (c = p->cookies; c; c = c->next) {
2550 if (strcmp(c->domain, domain) == 0 &&
2551 strcmp(c->path, path) == 0 &&
2552 strcmp(c->name, name) == 0) {
2553 if (c->prev) {
2554 c->prev->next = c->next;
2555 } else {
2556 p->cookies = c->next;
2557 }
2558
2559 if (c->next) {
2560 c->next->prev = c->prev;
2561 } else {
2562 p->cookies_end = c->prev;
2563 }
2564
2566
2567 return;
2568 }
2569 }
2570
2571 if (p->children) {
2572 p = p->children;
2573 } else {
2574 while (p != parent) {
2575 if (p->next != NULL) {
2576 p = p->next;
2577 break;
2578 }
2579
2580 p = p->parent;
2581 }
2582 }
2583 } while (p != parent);
2584}
2585
2586
2587/**
2588 * Deletes cookie hosts and their assoicated paths
2589 *
2590 * \param domain the cookie domain
2591 * \param path the cookie path
2592 * \param name The cookie name
2593 * \param parent The url data of the cookie
2594 */
2595static void
2596urldb_delete_cookie_hosts(const char *domain,
2597 const char *path,
2598 const char *name,
2599 struct host_part *parent)
2600{
2601 struct host_part *h;
2602 assert(parent);
2603
2604 urldb_delete_cookie_paths(domain, path, name, &parent->paths);
2605
2606 for (h = parent->children; h; h = h->next) {
2607 urldb_delete_cookie_hosts(domain, path, name, h);
2608 }
2609}
2610
2611
2612/**
2613 * Save a path subtree's cookies
2614 *
2615 * \param fp File pointer to write to
2616 * \param parent Parent path
2617 */
2618static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent)
2619{
2620 struct path_data *p = parent;
2621 time_t now = time(NULL);
2622
2623 assert(fp && parent);
2624
2625 do {
2626 if (p->cookies != NULL) {
2627 struct cookie_internal_data *c;
2628
2629 for (c = p->cookies; c != NULL; c = c->next) {
2630 if (c->expires == -1 || c->expires < now) {
2631 /* Skip expired & session cookies */
2632 continue;
2633 }
2634
2635 fprintf(fp,
2636 "%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t"
2637 "%s\t%s\t%d\t%s\t%s\t%s\n",
2638 c->version, c->domain,
2639 c->domain_from_set, c->path,
2640 c->path_from_set, c->secure,
2641 c->http_only,
2642 (int)c->expires, (int)c->last_used,
2643 c->no_destroy, c->name, c->value,
2645 p->scheme ? lwc_string_data(p->scheme) :
2646 "unused",
2647 p->url ? nsurl_access(p->url) :
2648 "unused",
2649 c->comment ? c->comment : "");
2650 }
2651 }
2652
2653 if (p->children != NULL) {
2654 p = p->children;
2655 } else {
2656 while (p != parent) {
2657 if (p->next != NULL) {
2658 p = p->next;
2659 break;
2660 }
2661
2662 p = p->parent;
2663 }
2664 }
2665 } while (p != parent);
2666}
2667
2668
2669/**
2670 * Save a host subtree's cookies
2671 *
2672 * \param fp File pointer to write to
2673 * \param parent Parent host
2674 */
2675static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent)
2676{
2677 struct host_part *h;
2678 assert(fp && parent);
2679
2680 urldb_save_cookie_paths(fp, &parent->paths);
2681
2682 for (h = parent->children; h; h = h->next)
2684}
2685
2686
2687/**
2688 * Destroy a cookie node
2689 *
2690 * \param c Cookie to destroy
2691 */
2693{
2694 free(c->name);
2695 free(c->value);
2696 free(c->comment);
2697 free(c->domain);
2698 free(c->path);
2699
2700 free(c);
2701}
2702
2703
2704/**
2705 * Destroy the contents of a path node
2706 *
2707 * \param node Node to destroy contents of (does not destroy node)
2708 */
2710{
2711 struct cookie_internal_data *a, *b;
2712 unsigned int i;
2713
2714 if (node->url != NULL) {
2715 nsurl_unref(node->url);
2716 }
2717
2718 if (node->scheme != NULL) {
2719 lwc_string_unref(node->scheme);
2720 }
2721
2722 free(node->segment);
2723 for (i = 0; i < node->frag_cnt; i++)
2724 free(node->fragment[i]);
2725 free(node->fragment);
2726
2727 free(node->urld.title);
2728
2729 for (a = node->cookies; a; a = b) {
2730 b = a->next;
2732 }
2733}
2734
2735
2736/**
2737 * Destroy protection space data
2738 *
2739 * \param space Protection space to destroy
2740 */
2742{
2743 lwc_string_unref(space->scheme);
2744 free(space->realm);
2745 free(space->auth);
2746
2747 free(space);
2748}
2749
2750
2751/**
2752 * Destroy a path tree
2753 *
2754 * \param root Root node of tree to destroy
2755 */
2757{
2758 struct path_data *p = root;
2759
2760 do {
2761 if (p->children != NULL) {
2762 p = p->children;
2763 } else {
2764 struct path_data *q = p;
2765
2766 while (p != root) {
2767 if (p->next != NULL) {
2768 p = p->next;
2769 break;
2770 }
2771
2772 p = p->parent;
2773
2775 free(q);
2776
2777 q = p;
2778 }
2779
2781 free(q);
2782 }
2783 } while (p != root);
2784}
2785
2786
2787/**
2788 * Destroy a host tree
2789 *
2790 * \param root Root node of tree to destroy
2791 */
2793{
2794 struct host_part *a, *b;
2795 struct path_data *p, *q;
2796 struct prot_space_data *s, *t;
2797
2798 /* Destroy children */
2799 for (a = root->children; a; a = b) {
2800 b = a->next;
2802 }
2803
2804 /* Now clean up paths */
2805 for (p = root->paths.children; p; p = q) {
2806 q = p->next;
2808 }
2809
2810 /* Root path */
2812
2813 /* Proctection space data */
2814 for (s = root->prot_space; s; s = t) {
2815 t = s->next;
2817 }
2818
2819 /* And ourselves */
2820 free(root->part);
2821 free(root);
2822}
2823
2824
2825/**
2826 * Destroy a search tree
2827 *
2828 * \param root Root node of tree to destroy
2829 */
2831{
2832 /* Destroy children */
2833 if (root->left != &empty)
2835 if (root->right != &empty)
2837
2838 /* And destroy ourselves */
2839 free(root);
2840}
2841
2842
2843/*************** External interface ***************/
2844
2845
2846/* exported interface documented in content/urldb.h */
2848{
2849 struct host_part *a, *b;
2850 int i;
2851
2852 /* Clean up search trees */
2853 for (i = 0; i < NUM_SEARCH_TREES; i++) {
2854 if (search_trees[i] != &empty) {
2856 search_trees[i] = &empty;
2857 }
2858 }
2859
2860 /* And database */
2861 for (a = db_root.children; a; a = b) {
2862 b = a->next;
2864 }
2865 memset(&db_root, 0, sizeof(db_root));
2866
2867 /* And the bloom filter */
2868 if (url_bloom != NULL) {
2870 url_bloom = NULL;
2871 }
2872}
2873
2874
2875/* exported interface documented in netsurf/url_db.h */
2876nserror urldb_load(const char *filename)
2877{
2878#define MAXIMUM_URL_LENGTH 4096
2879 char s[MAXIMUM_URL_LENGTH];
2880 char host[256];
2881 struct host_part *h;
2882 int urls;
2883 int i;
2884 int version;
2885 int length;
2886 FILE *fp;
2887
2888 assert(filename);
2889
2890 NSLOG(netsurf, INFO, "Loading URL file %s", filename);
2891
2892 if (url_bloom == NULL)
2894
2895 fp = fopen(filename, "r");
2896 if (!fp) {
2897 NSLOG(netsurf, INFO, "Failed to open file '%s' for reading",
2898 filename);
2899 return NSERROR_NOT_FOUND;
2900 }
2901
2902 if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
2903 fclose(fp);
2904 return NSERROR_NEED_DATA;
2905 }
2906
2907 version = atoi(s);
2909 NSLOG(netsurf, INFO, "Unsupported URL file version.");
2910 fclose(fp);
2911 return NSERROR_INVALID;
2912 }
2913 if (version > URL_FILE_VERSION) {
2914 NSLOG(netsurf, INFO, "Unknown URL file version.");
2915 fclose(fp);
2916 return NSERROR_INVALID;
2917 }
2918
2919 while (fgets(host, sizeof host, fp)) {
2920 time_t hsts_expiry = 0;
2921 int hsts_include_sub_domains = 0;
2922
2923 /* get the hostname */
2924 length = strlen(host) - 1;
2925 host[length] = '\0';
2926
2927 /* skip data that has ended up with a host of '' */
2928 if (length == 0) {
2929 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2930 break;
2931 urls = atoi(s);
2932 /* Eight fields/url */
2933 for (i = 0; i < (8 * urls); i++) {
2934 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2935 break;
2936 }
2937 continue;
2938 }
2939
2940 if (version >= 107) {
2941 char *p = host;
2942 while (*p && *p != ' ') p++;
2943 while (*p && *p == ' ') { *p = '\0'; p++; }
2944 hsts_include_sub_domains = (*p == '1');
2945 while (*p && *p != ' ') p++;
2946 while (*p && *p == ' ') p++;
2947 nsc_snptimet(p, strlen(p), &hsts_expiry);
2948 }
2949
2950 h = urldb_add_host(host);
2951 if (!h) {
2952 NSLOG(netsurf, INFO, "Failed adding host: '%s'", host);
2953 fclose(fp);
2954 return NSERROR_NOMEM;
2955 }
2956 h->hsts.expires = hsts_expiry;
2957 h->hsts.include_sub_domains = hsts_include_sub_domains;
2958
2959 /* read number of URLs */
2960 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2961 break;
2962 urls = atoi(s);
2963
2964 /* no URLs => try next host */
2965 if (urls == 0) {
2966 NSLOG(netsurf, INFO, "No URLs for '%s'", host);
2967 continue;
2968 }
2969
2970 /* load the non-corrupt data */
2971 for (i = 0; i < urls; i++) {
2972 struct path_data *p = NULL;
2973 char scheme[64], ports[10];
2974 char url[64 + 3 + 256 + 6 + 4096 + 1 + 1];
2975 unsigned int port;
2976 bool is_file = false;
2977 nsurl *nsurl;
2978 lwc_string *scheme_lwc, *fragment_lwc;
2979 char *path_query;
2980 size_t len;
2981
2982 if (!fgets(scheme, sizeof scheme, fp))
2983 break;
2984 length = strlen(scheme) - 1;
2985 scheme[length] = '\0';
2986
2987 if (!fgets(ports, sizeof ports, fp))
2988 break;
2989 length = strlen(ports) - 1;
2990 ports[length] = '\0';
2991 port = atoi(ports);
2992
2993 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2994 break;
2995 length = strlen(s) - 1;
2996 s[length] = '\0';
2997
2998 if (!strcasecmp(host, "localhost") &&
2999 !strcasecmp(scheme, "file"))
3000 is_file = true;
3001
3002 snprintf(url, sizeof url, "%s://%s%s%s%s",
3003 scheme,
3004 /* file URLs have no host */
3005 (is_file ? "" : host),
3006 (port ? ":" : ""),
3007 (port ? ports : ""),
3008 s);
3009
3010 /* TODO: store URLs in pre-parsed state, and make
3011 * a nsurl_load to generate the nsurl more
3012 * swiftly.
3013 * Need a nsurl_save too.
3014 */
3015 if (nsurl_create(url, &nsurl) != NSERROR_OK) {
3016 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3017 url);
3018 fclose(fp);
3019 return NSERROR_NOMEM;
3020 }
3021
3022 if (url_bloom != NULL) {
3023 uint32_t hash = nsurl_hash(nsurl);
3025 }
3026
3027 /* Copy and merge path/query strings */
3029 &path_query, &len) != NSERROR_OK) {
3030 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3031 url);
3032 fclose(fp);
3033 return NSERROR_NOMEM;
3034 }
3035
3036 scheme_lwc = nsurl_get_component(nsurl, NSURL_SCHEME);
3037 fragment_lwc = nsurl_get_component(nsurl,
3039 p = urldb_add_path(scheme_lwc, port, h, path_query,
3040 fragment_lwc, nsurl);
3041 if (!p) {
3042 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3043 url);
3044 fclose(fp);
3045 return NSERROR_NOMEM;
3046 }
3048 lwc_string_unref(scheme_lwc);
3049 if (fragment_lwc != NULL)
3050 lwc_string_unref(fragment_lwc);
3051
3052 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3053 break;
3054 if (p)
3055 p->urld.visits = (unsigned int)atoi(s);
3056
3057 /* entry last use time */
3058 if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
3059 break;
3060 }
3061 if (p) {
3062 nsc_snptimet(s, strlen(s) - 1, &p->urld.last_visit);
3063 }
3064
3065 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3066 break;
3067 if (p)
3068 p->urld.type = (content_type)atoi(s);
3069
3070 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3071 break;
3072
3073
3074 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3075 break;
3076 length = strlen(s) - 1;
3077 if (p && length > 0) {
3078 s[length] = '\0';
3079 p->urld.title = malloc(length + 1);
3080 if (p->urld.title)
3081 memcpy(p->urld.title, s, length + 1);
3082 }
3083 }
3084 }
3085
3086 fclose(fp);
3087 NSLOG(netsurf, INFO, "Successfully loaded URL file");
3088#undef MAXIMUM_URL_LENGTH
3089
3090 return NSERROR_OK;
3091}
3092
3093/* exported interface documented in netsurf/url_db.h */
3094nserror urldb_save(const char *filename)
3095{
3096 FILE *fp;
3097 int i;
3098
3099 assert(filename);
3100
3101 fp = fopen(filename, "w");
3102 if (!fp) {
3103 NSLOG(netsurf, INFO, "Failed to open file '%s' for writing",
3104 filename);
3105 return NSERROR_SAVE_FAILED;
3106 }
3107
3108 /* file format version number */
3109 fprintf(fp, "%d\n", URL_FILE_VERSION);
3110
3111 for (i = 0; i != NUM_SEARCH_TREES; i++) {
3113 }
3114
3115 fclose(fp);
3116
3117 return NSERROR_OK;
3118}
3119
3120
3121/* exported interface documented in content/urldb.h */
3123{
3124 struct path_data *p;
3125
3126 assert(url);
3127
3128 p = urldb_find_url(url);
3129 if (!p) {
3130 return NSERROR_NOT_FOUND;
3131 }
3132
3133 p->persistent = persist;
3134
3135 return NSERROR_OK;
3136}
3137
3138
3139/* exported interface documented in content/urldb.h */
3141{
3142 struct host_part *h;
3143 struct path_data *p;
3144 lwc_string *scheme;
3145 lwc_string *port;
3146 lwc_string *host;
3147 lwc_string *fragment;
3148 const char *host_str;
3149 char *path_query = NULL;
3150 size_t len;
3151 bool match;
3152 unsigned int port_int;
3153
3154 assert(url);
3155
3156 if (url_bloom == NULL)
3158
3159 if (url_bloom != NULL) {
3160 uint32_t hash = nsurl_hash(url);
3162 }
3163
3164 /* Copy and merge path/query strings */
3165 if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) !=
3166 NSERROR_OK) {
3167 return false;
3168 }
3169 assert(path_query != NULL);
3170
3172 if (scheme == NULL) {
3173 free(path_query);
3174 return false;
3175 }
3176
3178 if (host != NULL) {
3179 host_str = lwc_string_data(host);
3180 lwc_string_unref(host);
3181
3182 } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
3183 lwc_error_ok && match == true) {
3184 host_str = "localhost";
3185
3186 } else {
3187 lwc_string_unref(scheme);
3188 free(path_query);
3189 return false;
3190 }
3191
3193
3195 if (port != NULL) {
3196 port_int = atoi(lwc_string_data(port));
3197 lwc_string_unref(port);
3198 } else {
3199 port_int = 0;
3200 }
3201
3202 /* Get host entry */
3203 h = urldb_add_host(host_str);
3204
3205 /* Get path entry */
3206 if (h != NULL) {
3208 port_int,
3209 h,
3210 path_query,
3211 fragment,
3212 url);
3213 } else {
3214 p = NULL;
3215 }
3216
3217 lwc_string_unref(scheme);
3218 if (fragment != NULL)
3219 lwc_string_unref(fragment);
3220
3221 return (p != NULL);
3222}
3223
3224
3225/* exported interface documented in content/urldb.h */
3227{
3228 struct path_data *p;
3229 char *temp;
3230
3231 assert(url);
3232
3233 p = urldb_find_url(url);
3234 if (p == NULL) {
3235 return NSERROR_NOT_FOUND;
3236 }
3237
3238 /* copy the parameter if necessary */
3239 if (title != NULL) {
3240 temp = strdup(title);
3241 if (temp == NULL) {
3242 return NSERROR_NOMEM;
3243 }
3244 } else {
3245 temp = NULL;
3246 }
3247
3248 free(p->urld.title);
3249 p->urld.title = temp;
3250
3251 return NSERROR_OK;
3252}
3253
3254
3255/* exported interface documented in content/urldb.h */
3257{
3258 struct path_data *p;
3259
3260 assert(url);
3261
3262 p = urldb_find_url(url);
3263 if (!p) {
3264 return NSERROR_NOT_FOUND;
3265 }
3266
3267 p->urld.type = type;
3268
3269 return NSERROR_OK;
3270}
3271
3272
3273/* exported interface documented in content/urldb.h */
3275{
3276 struct path_data *p;
3277
3278 assert(url);
3279
3280 p = urldb_find_url(url);
3281 if (!p) {
3282 return NSERROR_NOT_FOUND;
3283 }
3284
3285 p->urld.last_visit = time(NULL);
3286 p->urld.visits++;
3287
3288 return NSERROR_OK;
3289}
3290
3291
3292/* exported interface documented in content/urldb.h */
3294{
3295 struct path_data *p;
3296
3297 assert(url);
3298
3299 p = urldb_find_url(url);
3300 if (!p)
3301 return;
3302
3303 p->urld.last_visit = (time_t)0;
3304 p->urld.visits = 0;
3305}
3306
3307
3308/* exported interface documented in netsurf/url_db.h */
3310{
3311 struct path_data *p;
3312 struct url_internal_data *u;
3313
3314 assert(url);
3315
3316 p = urldb_find_url(url);
3317 if (!p)
3318 return NULL;
3319
3320 u = &p->urld;
3321
3322 return (const struct url_data *) u;
3323}
3324
3325
3326/* exported interface documented in content/urldb.h */
3328{
3329 struct path_data *p;
3330
3331 assert(url);
3332
3333 p = urldb_find_url(url);
3334 if (!p)
3335 return NULL;
3336
3337 return p->url;
3338}
3339
3340
3341/* exported interface documented in netsurf/url_db.h */
3342void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth)
3343{
3344 struct path_data *p, *pi;
3345 struct host_part *h;
3346 struct prot_space_data *space, *space_alloc;
3347 char *realm_alloc, *auth_alloc;
3348 bool match;
3349
3350 assert(url && realm && auth);
3351
3352 /* add url, in case it's missing */
3353 urldb_add_url(url);
3354
3355 p = urldb_find_url(url);
3356
3357 if (!p)
3358 return;
3359
3360 /* Search for host_part */
3361 for (pi = p; pi->parent != NULL; pi = pi->parent)
3362 ;
3363 h = (struct host_part *)pi;
3364
3365 /* Search if given URL belongs to a protection space we already know of. */
3366 for (space = h->prot_space; space; space = space->next) {
3367 if (!strcmp(space->realm, realm) &&
3368 lwc_string_isequal(space->scheme, p->scheme,
3369 &match) == lwc_error_ok &&
3370 match == true &&
3371 space->port == p->port)
3372 break;
3373 }
3374
3375 if (space != NULL) {
3376 /* Overrule existing auth. */
3377 free(space->auth);
3378 space->auth = strdup(auth);
3379 } else {
3380 /* Create a new protection space. */
3381 space = space_alloc = malloc(sizeof(struct prot_space_data));
3382 realm_alloc = strdup(realm);
3383 auth_alloc = strdup(auth);
3384
3385 if (!space_alloc || !realm_alloc || !auth_alloc) {
3386 free(space_alloc);
3387 free(realm_alloc);
3388 free(auth_alloc);
3389 return;
3390 }
3391
3392 space->scheme = lwc_string_ref(p->scheme);
3393 space->port = p->port;
3394 space->realm = realm_alloc;
3395 space->auth = auth_alloc;
3396 space->next = h->prot_space;
3397 h->prot_space = space;
3398 }
3399
3400 p->prot_space = space;
3401}
3402
3403
3404/* exported interface documented in netsurf/url_db.h */
3405const char *urldb_get_auth_details(nsurl *url, const char *realm)
3406{
3407 struct path_data *p, *p_cur, *p_top;
3408
3409 assert(url);
3410
3411 /* add to the db, so our lookup will work */
3413
3414 p = urldb_find_url(url);
3415 if (!p)
3416 return NULL;
3417
3418 /* Check for any auth details attached to the path_data node or any of
3419 * its parents.
3420 */
3421 for (p_cur = p; p_cur != NULL; p_top = p_cur, p_cur = p_cur->parent) {
3422 if (p_cur->prot_space) {
3423 return p_cur->prot_space->auth;
3424 }
3425 }
3426
3427 /* Only when we have a realm (and canonical root of given URL), we can
3428 * uniquely locate the protection space.
3429 */
3430 if (realm != NULL) {
3431 const struct host_part *h = (const struct host_part *)p_top;
3432 const struct prot_space_data *space;
3433 bool match;
3434
3435 /* Search for a possible matching protection space. */
3436 for (space = h->prot_space; space != NULL;
3437 space = space->next) {
3438 if (!strcmp(space->realm, realm) &&
3439 lwc_string_isequal(space->scheme,
3440 p->scheme, &match) ==
3441 lwc_error_ok &&
3442 match == true &&
3443 space->port == p->port) {
3444 p->prot_space = space;
3445 return p->prot_space->auth;
3446 }
3447 }
3448 }
3449
3450 return NULL;
3451}
3452
3453
3454/* exported interface documented in netsurf/url_db.h */
3455void urldb_set_cert_permissions(nsurl *url, bool permit)
3456{
3457 struct path_data *p;
3458 struct host_part *h;
3459
3460 assert(url);
3461
3462 /* add url, in case it's missing */
3463 urldb_add_url(url);
3464
3465 p = urldb_find_url(url);
3466 if (!p)
3467 return;
3468
3469 for (; p && p->parent; p = p->parent)
3470 /* do nothing */;
3471 assert(p);
3472
3473 h = (struct host_part *)p;
3474
3475 h->permit_invalid_certs = permit;
3476}
3477
3478
3479/* exported interface documented in content/urldb.h */
3481{
3482 struct path_data *p;
3483 const struct host_part *h;
3484
3485 assert(url);
3486
3487 p = urldb_find_url(url);
3488 if (!p)
3489 return false;
3490
3491 for (; p && p->parent; p = p->parent)
3492 /* do nothing */;
3493 assert(p);
3494
3495 h = (const struct host_part *)p;
3496
3497 return h->permit_invalid_certs;
3498}
3499
3500
3501/* exported interface documented in content/urldb.h */
3502bool urldb_set_hsts_policy(struct nsurl *url, const char *header)
3503{
3504 struct path_data *p;
3505 struct host_part *h;
3506 lwc_string *host;
3507 time_t now = time(NULL);
3509 uint32_t max_age = 0;
3510 nserror error;
3511
3512 assert(url);
3513
3514 host = nsurl_get_component(url, NSURL_HOST);
3515 if (host != NULL) {
3516 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3517 /* Host is IP: ignore */
3518 lwc_string_unref(host);
3519 return true;
3520 } else if (lwc_string_length(host) == 0) {
3521 /* Host is blank: ignore */
3522 lwc_string_unref(host);
3523 return true;
3524 }
3525
3526 lwc_string_unref(host);
3527 } else {
3528 /* No host part: ignore */
3529 return true;
3530 }
3531
3532 /* add url, in case it's missing */
3533 urldb_add_url(url);
3534
3535 p = urldb_find_url(url);
3536 if (!p)
3537 return false;
3538
3539 for (; p && p->parent; p = p->parent)
3540 /* do nothing */;
3541 assert(p);
3542
3543 h = (struct host_part *)p;
3544 if (h->permit_invalid_certs) {
3545 /* Transport is tainted: ignore */
3546 return true;
3547 }
3548
3549 error = http_parse_strict_transport_security(header, &sts);
3550 if (error != NSERROR_OK) {
3551 /* Parse failed: ignore */
3552 return true;
3553 }
3554
3557
3559 if (max_age == 0) {
3560 h->hsts.expires = 0;
3561 h->hsts.include_sub_domains = false;
3562 } else if ((time_t) (now + max_age) > h->hsts.expires) {
3563 h->hsts.expires = now + max_age;
3564 }
3565
3567
3568 return true;
3569}
3570
3571
3572/* exported interface documented in content/urldb.h */
3574{
3575 struct path_data *p;
3576 const struct host_part *h;
3577 lwc_string *host;
3578 time_t now = time(NULL);
3579
3580 assert(url);
3581
3582 host = nsurl_get_component(url, NSURL_HOST);
3583 if (host != NULL) {
3584 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3585 /* Host is IP: not enabled */
3586 lwc_string_unref(host);
3587 return false;
3588 } else if (lwc_string_length(host) == 0) {
3589 /* Host is blank: not enabled */
3590 lwc_string_unref(host);
3591 return false;
3592 }
3593
3594 lwc_string_unref(host);
3595 } else {
3596 /* No host part: not enabled */
3597 return false;
3598 }
3599
3600 /* The URL must exist in the db in order to find HSTS policy, since
3601 * we search up the tree from the URL node, and policy from further
3602 * up may also apply. */
3603 urldb_add_url(url);
3604
3605 p = urldb_find_url(url);
3606 if (!p)
3607 return false;
3608
3609 for (; p && p->parent; p = p->parent)
3610 /* do nothing */;
3611 assert(p);
3612
3613 h = (const struct host_part *)p;
3614
3615 /* Consult record for this host */
3616 if (h->hsts.expires > now) {
3617 /* Not expired */
3618 return true;
3619 }
3620
3621 /* Consult parent domains */
3622 for (h = h->parent; h && h != &db_root; h = h->parent) {
3623 if (h->hsts.expires > now && h->hsts.include_sub_domains) {
3624 /* Not expired and subdomains included */
3625 return true;
3626 }
3627 }
3628
3629 return false;
3630}
3631
3632
3633/* exported interface documented in netsurf/url_db.h */
3634void
3635urldb_iterate_partial(const char *prefix,
3636 bool (*callback)(nsurl *url, const struct url_data *data))
3637{
3638 char host[256];
3639 char buf[260]; /* max domain + "www." */
3640 const char *slash, *scheme_sep;
3641 struct search_node *tree;
3642 const struct host_part *h;
3643
3644 assert(prefix && callback);
3645
3646 /* strip scheme */
3647 scheme_sep = strstr(prefix, "://");
3648 if (scheme_sep)
3649 prefix = scheme_sep + 3;
3650
3651 slash = strchr(prefix, '/');
3652 tree = urldb_get_search_tree(prefix);
3653
3654 if (slash) {
3655 /* if there's a slash in the input, then we can
3656 * assume that we're looking for a path */
3657 snprintf(host, sizeof host, "%.*s",
3658 (int) (slash - prefix), prefix);
3659
3660 h = urldb_search_find(tree, host);
3661 if (!h) {
3662 int len = slash - prefix;
3663
3664 if (len <= 3 || strncasecmp(host, "www.", 4) != 0) {
3665 snprintf(buf, sizeof buf, "www.%s", host);
3667 search_trees[ST_DN + 'w' - 'a'],
3668 buf);
3669 if (!h)
3670 return;
3671 } else
3672 return;
3673 }
3674
3675 if (h->paths.children) {
3676 /* Have paths, iterate them */
3677 urldb_iterate_partial_path(&h->paths, slash + 1,
3678 callback);
3679 }
3680
3681 } else {
3682 int len = strlen(prefix);
3683
3684 /* looking for hosts */
3685 if (!urldb_iterate_partial_host(tree, prefix, callback))
3686 return;
3687
3688 if (len <= 3 || strncasecmp(prefix, "www.", 4) != 0) {
3689 /* now look for www.prefix */
3690 snprintf(buf, sizeof buf, "www.%s", prefix);
3692 search_trees[ST_DN + 'w' - 'a'],
3693 buf, callback))
3694 return;
3695 }
3696 }
3697}
3698
3699
3700/* exported interface documented in netsurf/url_db.h */
3701void
3702urldb_iterate_entries(bool (*callback)(nsurl *url, const struct url_data *data))
3703{
3704 int i;
3705
3706 assert(callback);
3707
3708 for (i = 0; i < NUM_SEARCH_TREES; i++) {
3710 callback,
3711 NULL)) {
3712 break;
3713 }
3714 }
3715}
3716
3717
3718/* exported interface documented in content/urldb.h */
3719void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data))
3720{
3721 int i;
3722
3723 assert(callback);
3724
3725 for (i = 0; i < NUM_SEARCH_TREES; i++) {
3727 NULL, callback))
3728 break;
3729 }
3730}
3731
3732
3733/* exported interface documented in content/urldb.h */
3734bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer)
3735{
3736 const char *cur = header, *end;
3737 lwc_string *path, *host, *scheme;
3738 nsurl *urlt;
3739 bool match;
3740
3741 assert(url && header);
3742
3743 /* Get defragmented URL, as 'urlt' */
3744 if (nsurl_defragment(url, &urlt) != NSERROR_OK)
3745 return NULL;
3746
3747 scheme = nsurl_get_component(url, NSURL_SCHEME);
3748 if (scheme == NULL) {
3749 nsurl_unref(urlt);
3750 return false;
3751 }
3752
3754 if (path == NULL) {
3755 lwc_string_unref(scheme);
3756 nsurl_unref(urlt);
3757 return false;
3758 }
3759
3760 host = nsurl_get_component(url, NSURL_HOST);
3761 if (host == NULL) {
3762 lwc_string_unref(path);
3763 lwc_string_unref(scheme);
3764 nsurl_unref(urlt);
3765 return false;
3766 }
3767
3768 if (referer) {
3769 lwc_string *rhost;
3770
3771 /* Ensure that url's host name domain matches
3772 * referer's (4.3.5) */
3773 rhost = nsurl_get_component(referer, NSURL_HOST);
3774 if (rhost == NULL) {
3775 goto error;
3776 }
3777
3778 /* Domain match host names */
3779 if (lwc_string_isequal(host, rhost, &match) == lwc_error_ok &&
3780 match == false) {
3781 const char *hptr;
3782 const char *rptr;
3783 const char *dot;
3784 const char *host_data = lwc_string_data(host);
3785 const char *rhost_data = lwc_string_data(rhost);
3786
3787 /* Ensure neither host nor rhost are IP addresses */
3788 if (urldb__host_is_ip_address(host_data) ||
3789 urldb__host_is_ip_address(rhost_data)) {
3790 /* IP address, so no partial match */
3791 lwc_string_unref(rhost);
3792 goto error;
3793 }
3794
3795 /* Not exact match, so try the following:
3796 *
3797 * 1) Find the longest common suffix of host and rhost
3798 * (may be all of host/rhost)
3799 * 2) Discard characters from the start of the suffix
3800 * until the suffix starts with a dot
3801 * (prevents foobar.com matching bar.com)
3802 * 3) Ensure the suffix is non-empty and contains
3803 * embedded dots (to avoid permitting .com as a
3804 * suffix)
3805 *
3806 * Note that the above in no way resembles the
3807 * domain matching algorithm found in RFC2109.
3808 * It does, however, model the real world rather
3809 * more accurately.
3810 */
3811
3812 /** \todo In future, we should consult a TLD service
3813 * instead of just looking for embedded dots.
3814 */
3815
3816 hptr = host_data + lwc_string_length(host) - 1;
3817 rptr = rhost_data + lwc_string_length(rhost) - 1;
3818
3819 /* 1 */
3820 while (hptr >= host_data && rptr >= rhost_data) {
3821 if (*hptr != *rptr)
3822 break;
3823 hptr--;
3824 rptr--;
3825 }
3826 /* Ensure we end up pointing at the start of the
3827 * common suffix. The above loop will exit pointing
3828 * to the byte before the start of the suffix. */
3829 hptr++;
3830
3831 /* 2 */
3832 while (*hptr != '\0' && *hptr != '.')
3833 hptr++;
3834
3835 /* 3 */
3836 if (*hptr == '\0' ||
3837 (dot = strchr(hptr + 1, '.')) == NULL ||
3838 *(dot + 1) == '\0') {
3839 lwc_string_unref(rhost);
3840 goto error;
3841 }
3842 }
3843
3844 lwc_string_unref(rhost);
3845 }
3846
3847 end = cur + strlen(cur) - 2 /* Trailing CRLF */;
3848
3849 do {
3850 struct cookie_internal_data *c;
3851 char *dot;
3852 size_t len;
3853#ifdef WITH_NSPSL
3854 const char *suffix;
3855#endif
3856
3857 c = urldb_parse_cookie(url, &cur);
3858 if (!c) {
3859 /* failed => stop parsing */
3860 goto error;
3861 }
3862
3863 /* validate cookie */
3864
3865 /* 4.2.2:i Cookie must have NAME and VALUE */
3866 if (!c->name || !c->value) {
3868 goto error;
3869 }
3870
3871 /* 4.3.2:i Cookie path must be a prefix of URL path */
3872 len = strlen(c->path);
3873 if (len > lwc_string_length(path) ||
3874 strncmp(c->path, lwc_string_data(path),
3875 len) != 0) {
3877 goto error;
3878 }
3879
3880#ifdef WITH_NSPSL
3881 /* check domain is not a public suffix */
3882 dot = c->domain;
3883 if (*dot == '.') {
3884 dot++;
3885 }
3886 suffix = nspsl_getpublicsuffix(dot);
3887 if (suffix == NULL) {
3888 NSLOG(netsurf, INFO,
3889 "domain %s was a public suffix domain", dot);
3891 goto error;
3892 }
3893#else
3894 /* 4.3.2:ii Cookie domain must contain embedded dots */
3895 dot = strchr(c->domain + 1, '.');
3896 if (!dot || *(dot + 1) == '\0') {
3897 /* no embedded dots */
3899 goto error;
3900 }
3901#endif
3902
3903 /* Domain match fetch host with cookie domain */
3904 if (strcasecmp(lwc_string_data(host), c->domain) != 0) {
3905 int hlen, dlen;
3906 char *domain = c->domain;
3907
3908 /* c->domain must be a domain cookie here because:
3909 * c->domain is either:
3910 * + specified in the header as a domain cookie
3911 * (non-domain cookies in the header are ignored
3912 * by urldb_parse_cookie / urldb_parse_avpair)
3913 * + defaulted to the URL's host part
3914 * (by urldb_parse_cookie if no valid domain was
3915 * specified in the header)
3916 *
3917 * The latter will pass the strcasecmp above, which
3918 * leaves the former (i.e. a domain cookie)
3919 */
3920 assert(c->domain[0] == '.');
3921
3922 /* 4.3.2:iii */
3923 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3924 /* IP address, so no partial match */
3926 goto error;
3927 }
3928
3929 hlen = lwc_string_length(host);
3930 dlen = strlen(c->domain);
3931
3932 if (hlen <= dlen && hlen != dlen - 1) {
3933 /* Partial match not possible */
3935 goto error;
3936 }
3937
3938 if (hlen == dlen - 1) {
3939 /* Relax matching to allow
3940 * host a.com to match .a.com */
3941 domain++;
3942 dlen--;
3943 }
3944
3945 if (strcasecmp(lwc_string_data(host) + (hlen - dlen),
3946 domain)) {
3948 goto error;
3949 }
3950
3951 /* 4.3.2:iv Ensure H contains no dots
3952 *
3953 * If you believe the spec, H should contain no
3954 * dots in _any_ cookie. Unfortunately, however,
3955 * reality differs in that many sites send domain
3956 * cookies of the form .foo.com from hosts such
3957 * as bar.bat.foo.com and then expect domain
3958 * matching to work. Thus we have to do what they
3959 * expect, regardless of any potential security
3960 * implications.
3961 *
3962 * This is what code conforming to the spec would
3963 * look like:
3964 *
3965 * for (int i = 0; i < (hlen - dlen); i++) {
3966 * if (host[i] == '.') {
3967 * urldb_free_cookie(c);
3968 * goto error;
3969 * }
3970 * }
3971 */
3972 }
3973
3974 /* Now insert into database */
3975 if (!urldb_insert_cookie(c, scheme, urlt))
3976 goto error;
3977 } while (cur < end);
3978
3979 lwc_string_unref(host);
3980 lwc_string_unref(path);
3981 lwc_string_unref(scheme);
3982 nsurl_unref(urlt);
3983
3984 return true;
3985
3986error:
3987 lwc_string_unref(host);
3988 lwc_string_unref(path);
3989 lwc_string_unref(scheme);
3990 nsurl_unref(urlt);
3991
3992 return false;
3993}
3994
3995
3996/* exported interface documented in content/urldb.h */
3997char *urldb_get_cookie(nsurl *url, bool include_http_only)
3998{
3999 const struct path_data *p, *q;
4000 const struct host_part *h;
4001 lwc_string *path_lwc;
4002 struct cookie_internal_data *c;
4003 int count = 0, version = COOKIE_RFC2965;
4004 struct cookie_internal_data **matched_cookies;
4005 int matched_cookies_size = 20;
4006 int ret_alloc = 4096, ret_used = 1;
4007 const char *path;
4008 char *ret;
4009 lwc_string *scheme;
4010 time_t now;
4011 int i;
4012 bool match;
4013
4014 assert(url != NULL);
4015
4016 /* The URL must exist in the db in order to find relevant cookies, since
4017 * we search up the tree from the URL node, and cookies from further
4018 * up also apply. */
4019 urldb_add_url(url);
4020
4021 p = urldb_find_url(url);
4022 if (!p)
4023 return NULL;
4024
4025 scheme = p->scheme;
4026
4027 matched_cookies = malloc(matched_cookies_size *
4028 sizeof(struct cookie_internal_data *));
4029 if (!matched_cookies)
4030 return NULL;
4031
4032#define GROW_MATCHED_COOKIES \
4033 do { \
4034 if (count == matched_cookies_size) { \
4035 struct cookie_internal_data **temp; \
4036 temp = realloc(matched_cookies, \
4037 (matched_cookies_size + 20) * \
4038 sizeof(struct cookie_internal_data *)); \
4039 \
4040 if (temp == NULL) { \
4041 free(ret); \
4042 free(matched_cookies); \
4043 return NULL; \
4044 } \
4045 \
4046 matched_cookies = temp; \
4047 matched_cookies_size += 20; \
4048 } \
4049 } while(0)
4050
4051 ret = malloc(ret_alloc);
4052 if (!ret) {
4053 free(matched_cookies);
4054 return NULL;
4055 }
4056
4057 ret[0] = '\0';
4058
4059 path_lwc = nsurl_get_component(url, NSURL_PATH);
4060 if (path_lwc == NULL) {
4061 free(ret);
4062 free(matched_cookies);
4063 return NULL;
4064 }
4065 path = lwc_string_data(path_lwc);
4066 lwc_string_unref(path_lwc);
4067
4068 now = time(NULL);
4069
4070 if (*(p->segment) != '\0') {
4071 /* Match exact path, unless directory, when prefix matching
4072 * will handle this case for us. */
4073 for (q = p->parent->children; q; q = q->next) {
4074 if (strcmp(q->segment, p->segment))
4075 continue;
4076
4077 /* Consider all cookies associated with
4078 * this exact path */
4079 for (c = q->cookies; c; c = c->next) {
4080 if (c->expires != -1 && c->expires < now)
4081 /* cookie has expired => ignore */
4082 continue;
4083
4084 if (c->secure && lwc_string_isequal(
4085 q->scheme,
4086 corestring_lwc_https,
4087 &match) &&
4088 match == false)
4089 /* secure cookie for insecure host.
4090 * ignore */
4091 continue;
4092
4093 if (c->http_only && !include_http_only)
4094 /* Ignore HttpOnly */
4095 continue;
4096
4097 matched_cookies[count++] = c;
4098
4100
4101 if (c->version < (unsigned int)version)
4102 version = c->version;
4103
4104 c->last_used = now;
4105
4106 cookie_manager_add((struct cookie_data *)c);
4107 }
4108 }
4109 }
4110
4111 /* Now consider cookies whose paths prefix-match ours */
4112 for (p = p->parent; p; p = p->parent) {
4113 /* Find directory's path entry(ies) */
4114 /* There are potentially multiple due to differing schemes */
4115 for (q = p->children; q; q = q->next) {
4116 if (*(q->segment) != '\0')
4117 continue;
4118
4119 for (c = q->cookies; c; c = c->next) {
4120 if (c->expires != -1 && c->expires < now)
4121 /* cookie has expired => ignore */
4122 continue;
4123
4124 if (c->secure && lwc_string_isequal(
4125 q->scheme,
4126 corestring_lwc_https,
4127 &match) &&
4128 match == false)
4129 /* Secure cookie for insecure server
4130 * => ignore */
4131 continue;
4132
4133 matched_cookies[count++] = c;
4134
4136
4137 if (c->version < (unsigned int) version)
4138 version = c->version;
4139
4140 c->last_used = now;
4141
4142 cookie_manager_add((struct cookie_data *)c);
4143 }
4144 }
4145
4146 if (!p->parent) {
4147 /* No parent, so bail here. This can't go in
4148 * the loop exit condition as we also want to
4149 * process the top-level node.
4150 *
4151 * If p->parent is NULL then p->cookies are
4152 * the domain cookies and thus we don't even
4153 * try matching against them.
4154 */
4155 break;
4156 }
4157
4158 /* Consider p itself - may be the result of Path=/foo */
4159 for (c = p->cookies; c; c = c->next) {
4160 if (c->expires != -1 && c->expires < now)
4161 /* cookie has expired => ignore */
4162 continue;
4163
4164 /* Ensure cookie path is a prefix of the resource */
4165 if (strncmp(c->path, path, strlen(c->path)) != 0)
4166 /* paths don't match => ignore */
4167 continue;
4168
4169 if (c->secure && lwc_string_isequal(p->scheme,
4170 corestring_lwc_https,
4171 &match) &&
4172 match == false)
4173 /* Secure cookie for insecure server
4174 * => ignore */
4175 continue;
4176
4177 matched_cookies[count++] = c;
4178
4180
4181 if (c->version < (unsigned int) version)
4182 version = c->version;
4183
4184 c->last_used = now;
4185
4186 cookie_manager_add((struct cookie_data *)c);
4187 }
4188
4189 }
4190
4191 /* Finally consider domain cookies for hosts which domain match ours */
4192 for (h = (const struct host_part *)p; h && h != &db_root;
4193 h = h->parent) {
4194 for (c = h->paths.cookies; c; c = c->next) {
4195 if (c->expires != -1 && c->expires < now)
4196 /* cookie has expired => ignore */
4197 continue;
4198
4199 /* Ensure cookie path is a prefix of the resource */
4200 if (strncmp(c->path, path, strlen(c->path)) != 0)
4201 /* paths don't match => ignore */
4202 continue;
4203
4204 if (c->secure && lwc_string_isequal(scheme,
4205 corestring_lwc_https,
4206 &match) &&
4207 match == false)
4208 /* secure cookie for insecure host. ignore */
4209 continue;
4210
4211 matched_cookies[count++] = c;
4212
4214
4215 if (c->version < (unsigned int)version)
4216 version = c->version;
4217
4218 c->last_used = now;
4219
4220 cookie_manager_add((struct cookie_data *)c);
4221 }
4222 }
4223
4224 if (count == 0) {
4225 /* No cookies found */
4226 free(ret);
4227 free(matched_cookies);
4228 return NULL;
4229 }
4230
4231 /* and build output string */
4232 if (version > COOKIE_NETSCAPE) {
4233 sprintf(ret, "$Version=%d", version);
4234 ret_used = strlen(ret) + 1;
4235 }
4236
4237 for (i = 0; i < count; i++) {
4238 if (!urldb_concat_cookie(matched_cookies[i], version,
4239 &ret_used, &ret_alloc, &ret)) {
4240 free(ret);
4241 free(matched_cookies);
4242 return NULL;
4243 }
4244 }
4245
4246 if (version == COOKIE_NETSCAPE) {
4247 /* Old-style cookies => no version & skip "; " */
4248 memmove(ret, ret + 2, ret_used - 2);
4249 ret_used -= 2;
4250 }
4251
4252 /* Now, shrink the output buffer to the required size */
4253 {
4254 char *temp = realloc(ret, ret_used);
4255 if (!temp) {
4256 free(ret);
4257 free(matched_cookies);
4258 return NULL;
4259 }
4260
4261 ret = temp;
4262 }
4263
4264 free(matched_cookies);
4265
4266 return ret;
4267
4268#undef GROW_MATCHED_COOKIES
4269}
4270
4271
4272/* exported interface documented in content/urldb.h */
4273void urldb_delete_cookie(const char *domain, const char *path,
4274 const char *name)
4275{
4277}
4278
4279
4280/* exported interface documented in content/urldb.h */
4281void urldb_load_cookies(const char *filename)
4282{
4283 FILE *fp;
4284 char s[16*1024];
4285
4286 assert(filename);
4287
4288 fp = fopen(filename, "r");
4289 if (!fp)
4290 return;
4291
4292#define FIND_T { \
4293 for (; *p && *p != '\t'; p++) \
4294 ; /* do nothing */ \
4295 if (p >= end) { \
4296 NSLOG(netsurf, INFO, "Overran input"); \
4297 continue; \
4298 } \
4299 *p++ = '\0'; \
4300 }
4301
4302#define SKIP_T { \
4303 for (; *p && *p == '\t'; p++) \
4304 ; /* do nothing */ \
4305 if (p >= end) { \
4306 NSLOG(netsurf, INFO, "Overran input"); \
4307 continue; \
4308 } \
4309 }
4310
4311 while (fgets(s, sizeof s, fp)) {
4312 char *p = s, *end = 0,
4313 *domain, *path, *name, *value, *scheme, *url,
4314 *comment;
4315 int version, domain_specified, path_specified,
4316 secure, http_only, no_destroy, value_quoted;
4317 time_t expires, last_used;
4318 struct cookie_internal_data *c;
4319
4320 if(s[0] == 0 || s[0] == '#')
4321 /* Skip blank lines or comments */
4322 continue;
4323
4324 s[strlen(s) - 1] = '\0'; /* lose terminating newline */
4325 end = s + strlen(s);
4326
4327 /* Look for file version first
4328 * (all input is ignored until this is read)
4329 */
4330 if (strncasecmp(s, "Version:", 8) == 0) {
4332
4335 NSLOG(netsurf, INFO,
4336 "Unsupported Cookie file version");
4337 break;
4338 }
4339
4340 continue;
4341 } else if (loaded_cookie_file_version == 0) {
4342 /* Haven't yet seen version; skip this input */
4343 continue;
4344 }
4345
4346 /* One cookie/line */
4347
4348 /* Parse input */
4349 FIND_T; version = atoi(s);
4350 SKIP_T; domain = p; FIND_T;
4351 SKIP_T; domain_specified = atoi(p); FIND_T;
4352 SKIP_T; path = p; FIND_T;
4353 SKIP_T; path_specified = atoi(p); FIND_T;
4354 SKIP_T; secure = atoi(p); FIND_T;
4355 if (loaded_cookie_file_version > 101) {
4356 /* Introduced in version 1.02 */
4357 SKIP_T; http_only = atoi(p); FIND_T;
4358 } else {
4359 http_only = 0;
4360 }
4361 SKIP_T; expires = (time_t)atoi(p); FIND_T;
4362 SKIP_T; last_used = (time_t)atoi(p); FIND_T;
4363 SKIP_T; no_destroy = atoi(p); FIND_T;
4364 SKIP_T; name = p; FIND_T;
4365 SKIP_T; value = p; FIND_T;
4366 if (loaded_cookie_file_version > 100) {
4367 /* Introduced in version 1.01 */
4368 SKIP_T; value_quoted = atoi(p); FIND_T;
4369 } else {
4370 value_quoted = 0;
4371 }
4372 SKIP_T; scheme = p; FIND_T;
4373 SKIP_T; url = p; FIND_T;
4374
4375 /* Comment may have no content, so don't
4376 * use macros as they'll break */
4377 for (; *p && *p == '\t'; p++)
4378 ; /* do nothing */
4379 comment = p;
4380
4381 assert(p <= end);
4382
4383 /* Now create cookie */
4384 c = malloc(sizeof(struct cookie_internal_data));
4385 if (!c)
4386 break;
4387
4388 c->name = strdup(name);
4389 c->value = strdup(value);
4390 c->value_was_quoted = value_quoted;
4391 c->comment = strdup(comment);
4392 c->domain_from_set = domain_specified;
4393 c->domain = strdup(domain);
4394 c->path_from_set = path_specified;
4395 c->path = strdup(path);
4396 c->expires = expires;
4397 c->last_used = last_used;
4398 c->secure = secure;
4399 c->http_only = http_only;
4400 c->version = version;
4402
4403 if (!(c->name && c->value && c->comment &&
4404 c->domain && c->path)) {
4406 break;
4407 }
4408
4409 if (c->domain[0] != '.') {
4410 lwc_string *scheme_lwc = NULL;
4411 nsurl *url_nsurl = NULL;
4412
4413 assert(scheme[0] != 'u');
4414
4415 if (nsurl_create(url, &url_nsurl) != NSERROR_OK) {
4417 break;
4418 }
4419 scheme_lwc = nsurl_get_component(url_nsurl,
4420 NSURL_SCHEME);
4421
4422 /* And insert it into database */
4423 if (!urldb_insert_cookie(c, scheme_lwc, url_nsurl)) {
4424 /* Cookie freed for us */
4425 nsurl_unref(url_nsurl);
4426 lwc_string_unref(scheme_lwc);
4427 break;
4428 }
4429 nsurl_unref(url_nsurl);
4430 lwc_string_unref(scheme_lwc);
4431
4432 } else {
4433 if (!urldb_insert_cookie(c, NULL, NULL)) {
4434 /* Cookie freed for us */
4435 break;
4436 }
4437 }
4438 }
4439
4440#undef SKIP_T
4441#undef FIND_T
4442
4443 fclose(fp);
4444}
4445
4446
4447/* exported interface documented in content/urldb.h */
4448void urldb_save_cookies(const char *filename)
4449{
4450 FILE *fp;
4451 int cookie_file_version = max(loaded_cookie_file_version,
4453
4454 assert(filename);
4455
4456 fp = fopen(filename, "w");
4457 if (!fp)
4458 return;
4459
4460 fprintf(fp, "# NetSurf cookies file.\n"
4461 "#\n"
4462 "# Lines starting with a '#' are comments, "
4463 "blank lines are ignored.\n"
4464 "#\n"
4465 "# All lines prior to \"Version:\t%d\" are discarded.\n"
4466 "#\n"
4467 "# Version\tDomain\tDomain from Set-Cookie\tPath\t"
4468 "Path from Set-Cookie\tSecure\tHTTP-Only\tExpires\tLast used\t"
4469 "No destroy\tName\tValue\tValue was quoted\tScheme\t"
4470 "URL\tComment\n",
4471 cookie_file_version);
4472 fprintf(fp, "Version:\t%d\n", cookie_file_version);
4473
4475
4476 fclose(fp);
4477}
4478
4479
4480/* exported interface documented in netsurf/url_db.h */
4481void urldb_dump(void)
4482{
4483 int i;
4484
4486
4487 for (i = 0; i != NUM_SEARCH_TREES; i++) {
4489 }
4490}
4491
4492
4493
4494
Helpers for ASCII string handling.
static char ascii_to_lower(char c)
Convert an upper case character to lower case.
Definition: ascii.h:212
static bool ascii_is_alpha(char c)
Test whether a character is alphabetical (upper or lower case).
Definition: ascii.h:75
static bool ascii_is_digit(char c)
Test whether a character is a decimal digit.
Definition: ascii.h:86
static char version[32]
Definition: about.c:45
struct bloom_filter * bloom_create(size_t size)
Create a new bloom filter.
Definition: bloom.c:59
void bloom_destroy(struct bloom_filter *b)
Destroy a previously-created bloom filter.
Definition: bloom.c:71
bool bloom_search_hash(struct bloom_filter *b, uint32_t hash)
Search the filter for the given hash value, assuming it was added by bloom_insert_hash().
Definition: bloom.c:98
void bloom_insert_hash(struct bloom_filter *b, uint32_t hash)
Insert a given hash value into the filter, should you already have one to hand.
Definition: bloom.c:82
Trivial bloom filter.
static uint32_t count(const http_directive *list, lwc_string *key)
char * strndup(const char *s, size_t n)
Duplicate up to n characters of a string.
Definition: utils.c:332
Content handling interface.
content_type
The type of a content.
Definition: content_type.h:53
cookie_version
Version of cookie.
Definition: cookie_db.h:38
@ COOKIE_RFC2965
Definition: cookie_db.h:41
@ COOKIE_NETSCAPE
Definition: cookie_db.h:39
bool cookie_manager_add(const struct cookie_data *data)
Add/update a cookie to the viewer.
void cookie_manager_remove(const struct cookie_data *data)
Remove a cookie from viewer.
Cookie Manager (interface).
Useful interned string pointers (interface).
wimp_w parent
Definition: dialog.c:88
nserror
Enumeration of error codes.
Definition: errors.h:29
@ NSERROR_SAVE_FAILED
Failed to save data.
Definition: errors.h:36
@ NSERROR_NOT_FOUND
Requested item not found.
Definition: errors.h:34
@ NSERROR_NEED_DATA
More data needed.
Definition: errors.h:46
@ NSERROR_INVALID
Invalid data.
Definition: errors.h:49
@ NSERROR_NOMEM
Memory exhaustion.
Definition: errors.h:32
@ NSERROR_OK
No error.
Definition: errors.h:30
static struct directory * root
Definition: filename.c:55
const char * type
Definition: filetype.cpp:44
HTTP header parsing functions.
Generic bitmap handling interface.
internet structures and defines
int inet_aton(const char *cp, struct in_addr *inp)
Definition: utils.c:489
int inet_pton(int af, const char *src, void *dst)
Definition: utils.c:512
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116
NetSurf URL handling (interface).
nserror nsurl_create(const char *const url_s, nsurl **url)
Create a NetSurf URL object from a URL string.
nserror nsurl_defragment(const nsurl *url, nsurl **no_frag)
Create a NetSurf URL object without a fragment from a NetSurf URL.
void nsurl_unref(nsurl *url)
Drop a reference to a NetSurf URL object.
uint32_t nsurl_hash(const nsurl *url)
Get a URL's hash value.
nserror nsurl_get(const nsurl *url, nsurl_component parts, char **url_s, size_t *url_l)
Get URL (section) as a string, from a NetSurf URL object.
const char * nsurl_access(const nsurl *url)
Access a NetSurf URL object as a string.
lwc_string * nsurl_get_component(const nsurl *url, nsurl_component part)
Get part of a URL as a lwc_string, from a NetSurf URL object.
@ NSURL_FRAGMENT
Definition: nsurl.h:56
@ NSURL_SCHEME
Definition: nsurl.h:45
@ NSURL_PATH
Definition: nsurl.h:52
@ NSURL_HOST
Definition: nsurl.h:49
@ NSURL_PORT
Definition: nsurl.h:50
@ NSURL_QUERY
Definition: nsurl.h:53
struct nsurl nsurl
NetSurf URL object.
Definition: nsurl.h:31
nserror http_parse_strict_transport_security(const char *header_value, http_strict_transport_security **result)
Parse an HTTP Strict-Transport-Security header value.
void http_strict_transport_security_destroy(http_strict_transport_security *victim)
Destroy a strict transport security object.
bool http_strict_transport_security_include_subdomains(http_strict_transport_security *sts)
Get the value of a strict transport security's includeSubDomains flag.
uint32_t http_strict_transport_security_max_age(http_strict_transport_security *sts)
Get the value of a strict transport security's max-age.
Interface to utility string handling.
const struct cookie_data * next
Next in list.
Definition: cookie_db.h:46
cookie entry.
Definition: urldb.c:124
time_t expires
Expiry timestamp, or -1 for session.
Definition: urldb.c:136
struct cookie_internal_data * prev
Previous in list.
Definition: urldb.c:125
enum cookie_version version
Specification compliance.
Definition: urldb.c:140
char * path
Path.
Definition: urldb.c:135
time_t last_used
Last used time.
Definition: urldb.c:137
struct cookie_internal_data * next
Next in list.
Definition: urldb.c:126
char * domain
Domain.
Definition: urldb.c:133
bool http_only
Only expose to HTTP(S) requests.
Definition: urldb.c:139
bool path_from_set
Path came from Set-Cookie: header.
Definition: urldb.c:134
char * comment
Cookie comment.
Definition: urldb.c:131
bool domain_from_set
Domain came from Set-Cookie: header.
Definition: urldb.c:132
bool no_destroy
Never destroy this cookie, unless it's expired.
Definition: urldb.c:141
bool value_was_quoted
Value was quoted in Set-Cookie:
Definition: urldb.c:130
char * name
Cookie name.
Definition: urldb.c:128
bool secure
Only send for HTTPS requests.
Definition: urldb.c:138
char * value
Cookie value.
Definition: urldb.c:129
struct host_part * parent
Parent host part.
Definition: urldb.c:257
struct host_part * prev
Previous sibling.
Definition: urldb.c:256
struct host_part * children
Child host parts.
Definition: urldb.c:258
char * part
Part of host string.
Definition: urldb.c:247
struct path_data paths
Known paths on this host.
Definition: urldb.c:235
struct host_part * next
Next sibling.
Definition: urldb.c:255
bool permit_invalid_certs
Allow access to SSL protected resources on this host without verifying certificate authenticity.
Definition: urldb.c:240
struct hsts_data hsts
Definition: urldb.c:242
struct prot_space_data * prot_space
Linked list of all protection spaces known for this host and all its schemes and ports.
Definition: urldb.c:253
time_t expires
Expiry time.
Definition: urldb.c:226
bool include_sub_domains
Whether to include subdomains.
Definition: urldb.c:227
Representation of a Strict-Transport-Security.
data entry for url
Definition: urldb.c:194
bool persistent
This entry should persist.
Definition: urldb.c:203
struct url_internal_data urld
URL data for resource.
Definition: urldb.c:205
char * segment
Path segment for this node.
Definition: urldb.c:200
unsigned int frag_cnt
Number of entries in path_data::fragment.
Definition: urldb.c:201
struct path_data * last
Last child.
Definition: urldb.c:222
char ** fragment
Array of fragments.
Definition: urldb.c:202
const struct prot_space_data * prot_space
Protection space to which this resource belongs.
Definition: urldb.c:212
struct path_data * next
Next sibling.
Definition: urldb.c:218
struct cookie_internal_data * cookies
Cookies associated with resource.
Definition: urldb.c:214
struct cookie_internal_data * cookies_end
Last cookie in list.
Definition: urldb.c:216
nsurl * url
Full URL.
Definition: urldb.c:195
struct path_data * children
Child path segments.
Definition: urldb.c:221
struct path_data * prev
Previous sibling.
Definition: urldb.c:219
lwc_string * scheme
URL scheme for data.
Definition: urldb.c:196
unsigned int port
Port number for data.
Definition: urldb.c:197
struct path_data * parent
Parent path segment.
Definition: urldb.c:220
A protection space.
Definition: urldb.c:154
struct prot_space_data * next
Next sibling.
Definition: urldb.c:174
char * auth
Authentication details for this protection space in form username:password.
Definition: urldb.c:172
char * realm
Protection realm.
Definition: urldb.c:166
unsigned int port
Port number of canonical hostname of this protection space.
Definition: urldb.c:164
lwc_string * scheme
URL scheme of canonical hostname of this protection space.
Definition: urldb.c:158
search index node
Definition: urldb.c:265
unsigned int level
Node level.
Definition: urldb.c:268
struct search_node * left
Left subtree.
Definition: urldb.c:270
struct search_node * right
Right subtree.
Definition: urldb.c:271
const struct host_part * data
Host tree entry.
Definition: urldb.c:266
meta data about a url
Definition: urldb.c:183
content_type type
Type of resource.
Definition: urldb.c:187
char * title
Resource title.
Definition: urldb.c:184
time_t last_visit
Last visit time.
Definition: urldb.c:186
unsigned int visits
Visit count.
Definition: urldb.c:185
int nsc_sntimet(char *str, size_t size, time_t *timep)
Write the time in seconds since epoch to a buffer.
Definition: time.c:126
nserror nsc_strntimet(const char *str, size_t size, time_t *timep)
Converts a date string to a number of seconds since epoch.
Definition: time.c:980
nserror nsc_snptimet(const char *str, size_t size, time_t *timep)
Parse time in seconds since epoch.
Definition: time.c:147
Interface to time operations.
Interface to URL parsing and joining operations.
void urldb_set_cert_permissions(nsurl *url, bool permit)
Set certificate verification permissions.
Definition: urldb.c:3455
#define MAXIMUM_URL_LENGTH
static void urldb_destroy_path_tree(struct path_data *root)
Destroy a path tree.
Definition: urldb.c:2756
void urldb_destroy(void)
Destroy urldb.
Definition: urldb.c:2847
static struct search_node * urldb_search_skew(struct search_node *root)
Rotate a subtree right.
Definition: urldb.c:1592
static struct host_part * urldb_add_host_node(const char *part, struct host_part *parent)
Add a host node to the tree.
Definition: urldb.c:1006
static struct search_node ** urldb_get_search_tree_direct(const char *host)
Get the search tree for a particular host.
Definition: urldb.c:1172
static struct path_data * urldb_add_path_node(lwc_string *scheme, unsigned int port, const char *segment, lwc_string *fragment, struct path_data *parent)
Add a path node to the tree.
Definition: urldb.c:1104
#define GROW_MATCHED_COOKIES
static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent)
Save a host subtree's cookies.
Definition: urldb.c:2675
static struct search_node empty
Definition: urldb.c:282
static struct search_node * urldb_search_insert_internal(struct search_node *root, struct search_node *n)
Insert node into search tree.
Definition: urldb.c:1642
#define SKIP_T
static struct search_node * urldb_get_search_tree(const char *host)
Get the search tree for a particular host.
Definition: urldb.c:1191
static void urldb_destroy_search_tree(struct search_node *root)
Destroy a search tree.
Definition: urldb.c:2830
#define NUM_SEARCH_TREES
Search trees - one per letter + 1 for IPs + 1 for Everything Else.
Definition: urldb.c:278
static void urldb_free_cookie(struct cookie_internal_data *c)
Free a cookie.
Definition: urldb.c:1817
#define ST_IP
Definition: urldb.c:279
nserror urldb_set_url_persistence(nsurl *url, bool persist)
Set the cross-session persistence of the entry for an URL.
Definition: urldb.c:3122
static bool urldb_insert_cookie(struct cookie_internal_data *c, lwc_string *scheme, nsurl *url)
Insert a cookie into the database.
Definition: urldb.c:2257
static void urldb_write_paths(const struct path_data *parent, const char *host, FILE *fp, char **path, int *path_alloc, int *path_used, time_t expiry)
Write paths associated with a host.
Definition: urldb.c:353
bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer)
Parse Set-Cookie header and insert cookie(s) into database.
Definition: urldb.c:3734
static bool urldb_iterate_entries_host(struct search_node *parent, bool(*url_callback)(nsurl *url, const struct url_data *data), bool(*cookie_callback)(const struct cookie_data *data))
Host data iterator (internal)
Definition: urldb.c:963
static int urldb_search_match_prefix(const struct host_part *a, const char *b)
Compare host_part with prefix.
Definition: urldb.c:746
static nserror urldb_write_timet(FILE *fp, time_t val)
write a time_t to a file portably
Definition: urldb.c:327
void urldb_iterate_cookies(bool(*callback)(const struct cookie_data *data))
Definition: urldb.c:3719
bool urldb_get_hsts_enabled(struct nsurl *url)
Determine if HSTS policy is enabled for an URL.
Definition: urldb.c:3573
#define URL_FILE_VERSION
Current URL database file version.
Definition: urldb.c:300
void urldb_save_cookies(const char *filename)
Save persistent cookies to file.
Definition: urldb.c:4448
nsurl * urldb_get_url(nsurl *url)
Extract an URL from the db.
Definition: urldb.c:3327
static struct path_data * urldb_add_path_fragment(struct path_data *segment, lwc_string *fragment)
Add a fragment to a path segment.
Definition: urldb.c:1055
nserror urldb_set_url_title(nsurl *url, const char *title)
Set an URL's title string, replacing any existing one.
Definition: urldb.c:3226
nserror urldb_save(const char *filename)
Export the current database to file.
Definition: urldb.c:3094
bool urldb_add_url(nsurl *url)
Insert an URL into the database.
Definition: urldb.c:3140
static struct search_node * urldb_search_split(struct search_node *root)
Rotate a node left, increasing the parent's level.
Definition: urldb.c:1615
static void urldb_dump_search(struct search_node *parent, int depth)
Dump search tree.
Definition: urldb.c:1504
void urldb_dump(void)
Dump URL database to stderr.
Definition: urldb.c:4481
static bool urldb_iterate_partial_path(const struct path_data *parent, const char *prefix, bool(*callback)(nsurl *url, const struct url_data *data))
Partial path iterator (internal)
Definition: urldb.c:906
#define ST_DN
Definition: urldb.c:281
static void urldb_delete_cookie_paths(const char *domain, const char *path, const char *name, struct path_data *parent)
deletes paths from a cookie.
Definition: urldb.c:2538
#define MIN_COOKIE_FILE_VERSION
Minimum cookie database file version.
Definition: urldb.c:291
void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth)
Set authentication data for an URL.
Definition: urldb.c:3342
#define BLOOM_SIZE
Size of url filter.
Definition: urldb.c:317
bool urldb_get_cert_permissions(nsurl *url)
Retrieve certificate verification permissions from database.
Definition: urldb.c:3480
static int loaded_cookie_file_version
loaded cookie file version
Definition: urldb.c:295
bool urldb_set_hsts_policy(struct nsurl *url, const char *header)
Set HSTS policy for an URL.
Definition: urldb.c:3502
static bool urldb_iterate_entries_path(const struct path_data *parent, bool(*url_callback)(nsurl *url, const struct url_data *data), bool(*cookie_callback)(const struct cookie_data *data))
Path data iterator (internal)
Definition: urldb.c:584
static void urldb_dump_paths(struct path_data *parent)
Dump URL database paths to stderr.
Definition: urldb.c:1438
static void urldb_delete_cookie_hosts(const char *domain, const char *path, const char *name, struct host_part *parent)
Deletes cookie hosts and their associated paths.
Definition: urldb.c:2596
nserror urldb_load(const char *filename)
Import an URL database from file, replacing any existing database.
Definition: urldb.c:2876
static struct path_data * urldb_add_path(lwc_string *scheme, unsigned int port, const struct host_part *host, char *path_query, lwc_string *fragment, nsurl *url)
Add a path to the database, creating any intermediate entries.
Definition: urldb.c:2086
void urldb_iterate_entries(bool(*callback)(nsurl *url, const struct url_data *data))
Definition: urldb.c:3702
static struct host_part db_root
Root database handle.
Definition: urldb.c:275
static struct path_data * urldb_find_url(nsurl *url)
Find an URL in the database.
Definition: urldb.c:1358
static struct host_part * urldb_add_host(const char *host)
Add a host to the database, creating any intermediate entries.
Definition: urldb.c:2164
const struct url_data * urldb_get_url_data(nsurl *url)
Find data for an URL.
Definition: urldb.c:3309
static void urldb_destroy_prot_space(struct prot_space_data *space)
Destroy protection space data.
Definition: urldb.c:2741
const char * urldb_get_auth_details(nsurl *url, const char *realm)
Look up authentication details in database.
Definition: urldb.c:3405
static void urldb_destroy_path_node_content(struct path_data *node)
Destroy the contents of a path node.
Definition: urldb.c:2709
static void urldb_save_search_tree(struct search_node *parent, FILE *fp)
Save a search (sub)tree.
Definition: urldb.c:514
static const struct host_part * urldb_search_find(struct search_node *root, const char *host)
Find a node in a search tree.
Definition: urldb.c:1270
static struct search_node * search_trees[NUM_SEARCH_TREES]
Definition: urldb.c:283
nserror urldb_update_url_visit_data(nsurl *url)
Update an URL's visit data.
Definition: urldb.c:3274
static void urldb_destroy_host_tree(struct host_part *root)
Destroy a host tree.
Definition: urldb.c:2792
static struct bloom_filter * url_bloom
filter for url presence in database
Definition: urldb.c:313
static int urldb_add_path_fragment_cmp(const void *a, const void *b)
Fragment comparator callback for qsort.
Definition: urldb.c:1041
void urldb_reset_url_visit_data(nsurl *url)
Reset an URL's visit statistics.
Definition: urldb.c:3293
static struct search_node * urldb_search_insert(struct search_node *root, const struct host_part *data)
Insert a node into the search tree.
Definition: urldb.c:1679
#define FIND_T
#define MIN_URL_FILE_VERSION
Minimum URL database file version.
Definition: urldb.c:298
static bool urldb_concat_cookie(struct cookie_internal_data *c, int version, int *used, int *alloc, char **buf)
Concatenate a cookie into the provided buffer.
Definition: urldb.c:2376
static int urldb_search_match_host(const struct host_part *a, const struct host_part *b)
Compare a pair of host parts.
Definition: urldb.c:1554
nserror urldb_set_url_content_type(nsurl *url, content_type type)
Set an URL's content type.
Definition: urldb.c:3256
static struct cookie_internal_data * urldb_parse_cookie(nsurl *url, const char **cookie)
Parse a cookie.
Definition: urldb.c:1838
void urldb_delete_cookie(const char *domain, const char *path, const char *name)
Delete a cookie.
Definition: urldb.c:4273
static void urldb_dump_hosts(struct host_part *parent)
Dump URL database hosts to stderr.
Definition: urldb.c:1477
static void urldb_destroy_cookie(struct cookie_internal_data *c)
Destroy a cookie node.
Definition: urldb.c:2692
static void urldb_count_urls(const struct path_data *root, time_t expiry, unsigned int *count)
Count number of URLs associated with a host.
Definition: urldb.c:474
char * urldb_get_cookie(nsurl *url, bool include_http_only)
Retrieve cookies for an URL.
Definition: urldb.c:3997
static bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, char *v, bool was_quoted)
Parse a cookie avpair.
Definition: urldb.c:1709
static struct path_data * urldb_match_path(const struct path_data *parent, const char *path, lwc_string *scheme, unsigned short port)
Match a path string.
Definition: urldb.c:1302
static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent)
Save a path subtree's cookies.
Definition: urldb.c:2618
#define ST_EE
Definition: urldb.c:280
void urldb_iterate_partial(const char *prefix, bool(*callback)(nsurl *url, const struct url_data *data))
Definition: urldb.c:3635
void urldb_load_cookies(const char *filename)
Load a cookie file into the database.
Definition: urldb.c:4281
static bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, bool(*callback)(nsurl *url, const struct url_data *data))
Partial host iterator (internal)
Definition: urldb.c:816
#define COOKIE_FILE_VERSION
Current cookie database file version.
Definition: urldb.c:293
static bool urldb__host_is_ip_address(const char *host)
Check whether a host string is an IP address.
Definition: urldb.c:647
static int urldb_search_match_string(const struct host_part *a, const char *b)
Compare host part with a string.
Definition: urldb.c:1204
Unified URL information database internal interface.
Option reading and saving interface.
#define nsoption_int(OPTION)
Get the value of an integer option.
Definition: nsoption.h:313
Interface to a number of general purpose functionality.
#define max(x, y)
Definition: utils.h:50
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821