#include "i.h"

// function forward declarations
static void	split(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
static ParsedUrl*	urlfromparts(ParsedUrl* p);
static Rune*	canonize(Rune* s1, int n1, Rune* s2);
int	Ufmt(Fmt *f);

// globals

// Scheme names, indexed by the scheme code kept in ParsedUrl.scheme.
// (Presumably in the same order as the scheme constants in i.h -- confirm there.)
Rune* schemes[NSCHEMES] = {
	L"",
	L"http",
	L"https",
	L"ftp",
	L"file",
	L"gopher",
	L"mailto",
	L"news",
	L"nntp",
	L"telnet",
	L"wais",
	L"prospero",
	L"unknown"
};

// Install Ufmt as the handler for the 'U' print verb.
void
urlinit(void)
{
	fmtinstall('U', Ufmt);
}

// Parse the surl string into its components.
// Return a pointer to a newly allocated ParsedUrl.
// If makeabs is true, assume an http:// is in front of surl
// if surl has no // or :.
ParsedUrl*
makeurl(Rune* surl, int makeabs)
{
	int i;
	int surllen;
	int scheme = NOSCHEME;
	Rune* x;
	Rune* sch = nil;
	int schlen = 0;
	Rune* url = nil;
	int urllen = 0;
	Rune* up = nil;
	int uplen = 0;
	Rune* hp = nil;
	int hplen = 0;
	Rune* netloc = nil;
	int netloclen = 0;
	int slsl;
	ParsedUrl u;

	memset(&u, 0, sizeof(ParsedUrl));
	surllen = Strlen(surl);

	// Tentatively take everything before the first ':' as the scheme.
	split(surl, surllen, L":", &sch, &schlen, &url, &urllen);
	if(urllen == 0) {
		// nothing after a ':' (or no ':' at all): no scheme
		url = surl;
		urllen = surllen;
		sch = nil;
		schlen = 0;
	}
	else {
		// a scheme may contain only letters, digits, '-', '.', '+'
		x = Strnclass(sch, L"^-a-zA-Z0-9.+", schlen);
		if(x != nil) {
			// the ':' belonged to something else; treat all of surl as the url
			url = surl;
			urllen = surllen;
			sch = nil;
			schlen = 0;
		}
		else {
			scheme = UNKNOWN;
			for(i = 0; i < NSCHEMES; i++)
				if(!Strncmpci(sch, schlen, schemes[i])) {
					scheme = i;
					schlen = Strlen(schemes[i]);
					break;
				}
		}
	}

	if(scheme == MAILTO) {
		// everything after "mailto:" is kept as the path, unparsed
		u.path = url;
		u.npath = urllen;
	}
	else {
		slsl = 0;
		if(urllen >= 2 && url[0] == '/' && url[1] == '/' ) {
			slsl = 1;
			url += 2;
			urllen -= 2;
		}
		else if(makeabs && scheme==NOSCHEME) {
			// behave as though "http://" had been written in front
			slsl = 1;
			scheme = HTTP;
		}
		if(slsl) {
			// netloc is everything up to the first '/'
			splitl(url, urllen, L"/", &netloc, &netloclen, &u.path, &u.npath);
			if(u.npath != 0) {
				// splitl leaves the '/' on the path; it is recorded as pstart instead
				u.path = u.path+1;
				u.npath--;
			}
			u.pstart = L"/";
			u.npstart = 1;
			if(scheme == FILE) {
				u.host = netloc;
				u.nhost = netloclen;
			}
			else {
				// netloc is [user[:passwd]@]host[:port]
				split(netloc, netloclen, L"@", &up, &uplen, &hp, &hplen);
				if(hplen == 0) {
					hp = up;
					hplen = uplen;
				}
				else
					split(up, uplen, L":", &u.user, &u.nuser, &u.passwd, &u.npasswd);
				split(hp, hplen, L":", &u.host, &u.nhost, &u.port, &u.nport);
			}
		}
		else {
			if(urllen > 0 && url[0] == '/') {
				u.pstart = L"/";
				u.npstart = 1;
				u.path = url+1;
				u.npath = urllen-1;
			}
			else {
				u.path = url;
				u.npath = urllen;
			}
		}
		if(scheme == FILE) {
			if(u.nhost == 0) {
				u.host = L"localhost";
				u.nhost = 9;
			}
		}
		else {
			// peel the fragment off first, then the query
			split(u.path, u.npath, L"#", &u.path, &u.npath, &u.frag, &u.nfrag);
			split(u.path, u.npath, L"?", &u.path, &u.npath, &u.query, &u.nquery);
		}
	}
	u.scheme = scheme;
	return urlfromparts(&u);
}

// Handler for the 'U' print verb: the argument is a ParsedUrl*,
// and its url string is emitted (the empty string for a nil argument).
int
Ufmt(Fmt *f)
{
	ParsedUrl* u;
	Rune* su;

	u = va_arg(f->args, ParsedUrl*);
	if(u == nil)
		su = L"";
	else
		su = u->url;
	return fmtrunestrcpy(f, su);
}

// Return a URL that is u made absolute relative to b.
// The original urls won't be touched.
// Note: when u has a scheme other than http, u itself is returned
// (not a copy); otherwise the result is newly allocated.
ParsedUrl*
makeabsoluteurl(ParsedUrl* u, ParsedUrl* b)
{
	int n;
	int scheme;
	Rune* p;
	Rune* rel;
	Rune* path;
	ParsedUrl* ans;
	ParsedUrl t;

	scheme = u->scheme;
	// Should return if SCHEME already given (according to RFC1808,
	// but various extant web pages violate that rule for non-http schemes
	if(scheme != NOSCHEME && scheme != HTTP)
		return u;

	// A url without a scheme inherits the base's scheme.  This also
	// covers the completely empty url, which becomes a copy of b.
	if(scheme == NOSCHEME)
		scheme = b->scheme;

	path = nil;
	memset(&t, 0, sizeof(ParsedUrl));
	if(u->nhost == 0 && u->npath == 0 && u->npstart == 0 && u->nquery == 0 && u->nfrag == 0) {
		memmove(&t, b, sizeof(ParsedUrl));
	}
	else {
		memmove(&t, u, sizeof(ParsedUrl));
		if(t.nhost == 0) {
			// no network location given: inherit it from the base
			t.user = b->user;
			t.nuser = b->nuser;
			t.passwd = b->passwd;
			t.npasswd = b->npasswd;
			t.host = b->host;
			t.nhost = b->nhost;
			t.port = b->port;
			t.nport = b->nport;
			if(t.npstart == 0) {
				// path is relative
				t.pstart = L"/";
				t.npstart = 1;
				if(t.npath == 0) {
					t.path = b->path;
					t.npath = b->npath;
					if(t.nquery == 0) {
						t.query = b->query;
						t.nquery = b->nquery;
					}
				}
				else {
					// keep the base path up to (not including) its last '/'
					n = b->npath;
					p = Strnrclass(b->path, L"/", n);
					if(p == nil)
						n = 0;
					else
						n = p-b->path;
					// canonize measures its last argument with Strlen, but
					// u->path points into u->url and is followed there by any
					// query and fragment.  Hand it a terminated copy of just
					// the path so those are not folded into the new path.
					rel = emalloc((u->npath+1)*sizeof(Rune));
					memmove(rel, u->path, u->npath*sizeof(Rune));
					rel[u->npath] = 0;
					path = canonize(b->path, n, rel);
					free(rel);
					t.path = path;
					// canonize returns nil when no path elements remain
					t.npath = (path == nil)? 0 : Strlen(path);
				}
			}
		}
	}
	t.scheme = scheme;
	ans = urlfromparts(&t);
	// urlfromparts copied every part, so the canonized path can go
	free(path);
	return ans;
}

// Make a copy of url that has q as query.
// The copy is built by urlfromparts; url itself is not modified.
// q is measured with Strlen, so it must be a terminated Rune string.
ParsedUrl*
makequeryurl(ParsedUrl* url, Rune* q)
{
	ParsedUrl t;

	memmove(&t, url, sizeof(ParsedUrl));
	t.query = q;
	t.nquery = Strlen(q);
	return urlfromparts(&t);
}

// p has correct values for all its fields, except they might
// point into different strings.  Make a new ParsedUrl with
// one contiguous string containing all the parts.
// In the result every part pointer points into ans->url, just past
// any separator that introduces the part, and ans->nurl counts the
// terminating 0 as well.
static ParsedUrl*
urlfromparts(ParsedUrl* p)
{
	int n;
	int schlen;
	Rune* x;
	ParsedUrl* ans;

	if(p->scheme == NOSCHEME)
		schlen = 0;
	else
		schlen = Strlen(schemes[p->scheme]);

	// n is the length of the assembled url, not counting the terminator:
	// scheme and its ':'
	n = schlen + (schlen != 0);
	// "//", then user, passwd, host, port with their ':', '@', ':' separators
	if(p->nhost != 0)
		n += 2 + p->nuser + p->npasswd + p->nhost + p->nport +
			(p->npasswd != 0) + (p->nuser != 0) + (p->nport != 0);
	// path start, path, and query/fragment with their '?' and '#'
	n += p->npstart + p->npath + p->nquery + p->nfrag +
		(p->nquery != 0) + (p->nfrag != 0);

	// NOTE(review): room for the terminating 0 is assumed to come from
	// the url member inside ParsedUrl itself -- confirm against i.h.
	ans = (ParsedUrl*)emalloc(sizeof(ParsedUrl)+n*sizeof(Rune));
	x = ans->url;
	ans->scheme = p->scheme;
	ans->nuser = p->nuser;
	ans->npasswd = p->npasswd;
	ans->nhost = p->nhost;
	ans->nport = p->nport;
	ans->npstart = p->npstart;
	ans->npath = p->npath;
	ans->nquery = p->nquery;
	ans->nfrag = p->nfrag;

	if(schlen != 0) {
		x = Stradd(x, schemes[p->scheme], schlen);
		*x++ = ':';
	}
	if(p->nhost != 0) {
		x = Stradd(x, L"//", 2);
		ans->user = x;
		ans->passwd = x;
		if(p->nuser != 0) {
			x = Stradd(x, p->user, p->nuser);
			if(p->npasswd != 0) {
				*x++ = ':';
				ans->passwd = x;
				x = Stradd(x, p->passwd, p->npasswd);
			}
			*x++ = '@';
		}
		ans->host = x;
		ans->port = x;
		x = Stradd(x, p->host, p->nhost);
		if(p->nport != 0) {
			*x++ = ':';
			ans->port = x;
			x = Stradd(x, p->port, p->nport);
		}
	}
	else {
		// no network location: the (empty) parts all point at the same spot
		ans->user = x;
		ans->passwd = x;
		ans->host = x;
		ans->port = x;
	}
	ans->pstart = x;
	// npstart is 0 or 1, so this copies either nothing or the "/"
	x = Stradd(x, L"/", p->npstart);
	ans->path = x;
	x = Stradd(x, p->path, p->npath);
	ans->query = x;
	if(p->nquery != 0) {
		*x++ = '?';
		ans->query = x;
		x = Stradd(x, p->query, p->nquery);
	}
	ans->frag = x;
	if(p->nfrag != 0) {
		*x++ = '#';
		// point past the '#', as is done for the query's '?', so that
		// frag[0:nfrag] is the fragment text and the url can be re-assembled
		ans->frag = x;
		x = Stradd(x, p->frag, p->nfrag);
	}
	*x++ = 0;
	assert(x == ans->url+n+1);
	ans->nurl = n+1;
	return ans;
}

// Don't include fragment in test, since we are testing if the
pointed to docs are the same, not places within docs. int urlequal(ParsedUrl* a, ParsedUrl* b) { return a->npath == b->npath && Streqn(a->path, a->npath, b->path) && a->scheme == b->scheme && a->nhost == b->nhost && Streqn(a->host, a->nhost, b->host) && a->nport == b->nport && Streqn(a->port, a->nport, b->port) && a->nuser == b->nuser && Streqn(a->user, a->nuser, b->user) && a->npasswd == b->npasswd && Streqn(a->passwd, a->npasswd, b->passwd) && a->npstart == b->npstart && Streqn(a->pstart, a->npstart, b->pstart) && a->nquery == b->nquery && Streqn(a->query, a->nquery, b->query); } // Like splitl, but assume one char match and omit that from second part. // If no split, all s goes in first component static void split(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2) { splitl(s, n, cl, p1, n1, p2, n2); if((*n2) != 0) { (*p2)++; (*n2)--; } } // Make a new string that is canonization of the path s1[0:n1]/s2. // (Canonization removes ./ and ../ from string). static Rune* canonize(Rune* s1, int n1, Rune* s2) { int k, m, n, shiftby, shiftstart; Rune* ans; Rune* r; Rune* elem[SMALLBUFSIZE]; int elen[SMALLBUFSIZE]; k = splitall(s1, n1, L"/", elem, elen, SMALLBUFSIZE-1); m = splitall(s2, Strlen(s2), L"/", elem+k, elen+k, SMALLBUFSIZE-k); n = m+k; if(n == SMALLBUFSIZE) trace("warning: url too long; truncated\n"); for(k = 0; k < n; ) { m = elen[k]; if(m <= 2) { shiftby = 0; shiftstart = 0; r = elem[k]; if(r[0] == '.') { if(m == 1) { shiftstart = k; shiftby = 1; } else if(r[1] == '.') { shiftstart = k-1; shiftby = 2; if(shiftstart == -1) { shiftstart = 0; shiftby = 1; } } } if(shiftby != 0) { for(m = shiftstart+shiftby; m < n; m++) { elem[m-shiftby] = elem[m]; elen[m-shiftby] = elen[m]; } n -= shiftby; k = shiftstart; continue; } } k++; } if(n == 0) return nil; m = 0; for(k = 0; k < n; k++) m += elen[k]; m += k-1; ans = emalloc((m+1)*sizeof(Rune)); r = ans; for(k = 0; k < n; k++) { r = Stradd(r, elem[k], elen[k]); if(k < n-1) *r++ = '/'; } *r = 0; return ans; } 
ParsedUrl* copyurl(ParsedUrl* url) { return urlfromparts(url); } // For debugging static int validurlpart(ParsedUrl* u, int n, int npart, Rune* part) { return npart == 0 || (npart > 0 && part != nil && &u->url[0] <= part && part+npart <= &u->url[n]); } int validurl(ParsedUrl* u) { int n; if(u == nil || u->scheme < NOSCHEME || u->scheme >= NSCHEMES) return 0; else { n = (u->scheme == NOSCHEME)? 0 : Strlen(schemes[u->scheme])+1; if(u->nhost != 0) n += 2 + u->nuser + u->npasswd + u->nhost + u->nport + (u->npasswd != 0) + (u->nuser != 0) + (u->nport != 0); n += u->npstart + u->npath + u->nquery + u->nfrag + (u->nquery != 0) + (u->nfrag != 0); return n == u->nurl && u->url[n] == 0 && validurlpart(u, n, u->nuser, u->user) && validurlpart(u, n, u->npasswd, u->passwd) && validurlpart(u, n, u->nhost, u->host) && validurlpart(u, n, u->nport, u->port) && (u->npstart == 0 || (u->npstart == 1 && u->pstart[0] == L'/')) && validurlpart(u, n, u->npath, u->path) && validurlpart(u, n, u->nquery, u->query) && validurlpart(u, n, u->nfrag, u->frag); } }