url.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2000-2015, University of Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(url,
   36          [ parse_url/2,                % +URL, -Parts | -URL +Parts
   37            parse_url/3,                % +URL|URI, +BaseURL, -Parts
   38                                        % -URL, +BaseURL, +Parts
   39            is_absolute_url/1,          % +URL
   40            global_url/3,               % +Local, +Base, -Global
   41            http_location/2,            % ?Parts, ?Location
   42            www_form_encode/2,          % Value <-> Encoded
   43            parse_url_search/2,         % Form-data <-> Form fields
   44
   45            url_iri/2,                  % ?URL, ?IRI
   46
   47            file_name_to_url/2,         % ?FileName, ?URL
   48
   49            set_url_encoding/2          % ?Old, +New
   50          ]).   51:- autoload(library(error),[must_be/2,representation_error/1]).   52:- autoload(library(lists),[append/3,select/3,delete/3]).   53:- autoload(library(utf8),[utf8_codes/3]).   54
   55
   56/** <module> Analysing and constructing URL
   57
   58This library deals with the analysis and construction of a URL,
   59Universal Resource Locator. URL is the basis for communicating locations
   60of resources (data) on the web. A URL consists of a protocol identifier
   61(e.g. HTTP, FTP, and a protocol-specific syntax further defining the
   62location. URLs are standardized in RFC-1738.
   63
   64The implementation in this library covers only a small portion of the
   65defined protocols.  Though the initial implementation followed RFC-1738
   66strictly, the current is more relaxed to deal with frequent violations
   67of the standard encountered in practical use.
   68
   69@author Jan Wielemaker
   70@author Lukas Faulstich
   71@deprecated New code should use library(uri), provided by the =clib=
   72            package.
   73*/
   74
   75                 /*******************************
   76                 *            GLOBALISE         *
   77                 *******************************/
   78
   79%!  global_url(+URL, +Base, -Global) is det.
   80%
   81%   Translate a possibly relative URL  into   an  absolute  one.
   82%
   83%   @error syntax_error(illegal_url) if URL is not legal.
   84
   85global_url(URL, BaseURL, Global) :-
   86    (   is_absolute_url(URL),
   87        \+ sub_atom(URL, _, _, _, '%')      % may have escape, use general
   88    ->  Global = URL
   89    ;   sub_atom(URL, 0, _, _, '//')
   90    ->  parse_url(BaseURL, [], Attributes),
   91        memberchk(protocol(Proto), Attributes),
   92        atomic_list_concat([Proto, :, URL], Global)
   93    ;   sub_atom(URL, 0, _, _, #)
   94    ->  (   sub_atom(BaseURL, _, _, 0, #)
   95        ->  sub_atom(URL, 1, _, 0, NoHash),
   96            atom_concat(BaseURL, NoHash, Global)
   97        ;   atom_concat(BaseURL, URL, Global)
   98        )
   99    ;   parse_url(URL, BaseURL, Attributes)
  100    ->  phrase(curl(Attributes), Chars),
  101        atom_codes(Global, Chars)
  102    ;   throw(error(syntax_error(illegal_url), URL))
  103    ).
  104
  105%!  is_absolute_url(+URL)
  106%
  107%   True if URL is an absolute URL. That  is, a URL that starts with
  108%   a protocol identifier.
  109
  110is_absolute_url(URL) :-
  111    sub_atom(URL, 0, _, _, 'http://'),
  112    !.
  113is_absolute_url(URL) :-
  114    sub_atom(URL, 0, _, _, 'https://'),
  115    !.
  116is_absolute_url(URL) :-
  117    sub_atom(URL, 0, _, _, 'ftp://'),
  118    !.
  119is_absolute_url(URL) :-
  120    sub_atom(URL, 0, _, _, 'file://'),
  121    !.
  122is_absolute_url(URL) :-
  123    atom_codes(URL, Codes),
  124    phrase(absolute_url, Codes, _),
  125    !.
  126
  127
  128                 /*******************************
  129                 *        CREATE URL/URI        *
  130                 *******************************/
  131
  132%!  http_location(?Parts, ?Location)
  133%
  134%   Construct or analyze an  HTTP  location.   This  is  similar  to
  135%   parse_url/2, but only deals with the   location  part of an HTTP
  136%   URL. That is, the path, search   and fragment specifiers. In the
  137%   HTTP protocol, the first line of a message is
  138%
  139%       ==
  140%       <Action> <Location> HTTP/<version>
  141%       ==
  142%
  143%   @param Location Atom or list of character codes.
  144
  145http_location(Parts, Location) :-       % Parts --> Location
  146    nonvar(Parts),
  147    !,
  148    phrase(curi(Parts), String),
  149    !,
  150    atom_codes(Location, String).
  151http_location(Parts, Location) :-       % Location --> Parts
  152    atom(Location),
  153    !,
  154    atom_codes(Location, Codes),
  155    phrase(http_location(Parts), Codes).
  156http_location(Parts, Codes) :-          % LocationCodes --> Parts
  157    is_list(Codes),
  158    phrase(http_location(Parts), Codes).
  159
  160
  161curl(A) -->
  162    { memberchk(protocol(Protocol), A)
  163    },
  164    !,
  165    catomic(Protocol),
  166    ":",
  167    curl(Protocol, A).
  168curl(A) -->
  169    curl(http, A).
  170
  171curl(file, A) -->
  172    !,
  173    (   "//"
  174    ->  cpath(A)
  175    ;   cpath(A)
  176    ).
  177curl(_, A) -->
  178    "//",
  179    cuser(A),
  180    chost(A),
  181    cport(A),
  182    cpath(A),
  183    csearch(A),
  184    cfragment(A).
  185
  186curi(A) -->
  187    cpath(A),
  188    csearch(A).
  189
  190cpath(A) -->
  191    (   { memberchk(path(Path), A) }
  192    ->  { atom_codes(Path, Codes) },
  193        www_encode(Codes, [0'/, 0'+, 0':, 0',])
  194    ;   ""
  195    ).
  196
  197cuser(A) -->
  198    (   { memberchk(user(User), A) }
  199    ->  { atom_codes(User, Codes) },
  200        www_encode(Codes, [0':]),
  201        "@"
  202    ;   ""
  203    ).
  204
  205chost(A) -->
  206    (   { memberchk(host(Host), A) }
  207    ->  { atom_codes(Host, Codes) },
  208        www_encode(Codes, [])
  209    ;   ""
  210    ).
  211
  212cport(A) -->
  213    (   { memberchk(port(Port), A), Port \== 80 }
  214    ->  { number_codes(Port, Codes) },
  215        ":",
  216        www_encode(Codes, [])
  217    ;   ""
  218    ).
  219
  220
  221catomic(A, In, Out) :-
  222    atom_codes(A, Codes),
  223    append(Codes, Out, In).
  224
  225%!  csearch(+Attributes)//
  226
  227csearch(A)-->
  228    (   { memberchk(search(Parameters), A) }
  229    ->  csearch(Parameters, [0'?])
  230    ;   []
  231    ).
  232
  233csearch([], _) -->
  234    [].
  235csearch([Parameter|Parameters], Sep) -->
  236    !,
  237    codes(Sep),
  238    cparam(Parameter),
  239    csearch(Parameters, [0'&]).
  240
  241cparam(Name=Value) -->
  242    !,
  243    cname(Name),
  244    "=",
  245    cvalue(Value).
  246cparam(NameValue) -->                   % allow to feed Name(Value)
  247    { compound(NameValue),
  248      !,
  249      NameValue =.. [Name,Value]
  250    },
  251    cname(Name),
  252    "=",
  253    cvalue(Value).
  254cparam(Name)-->
  255    cname(Name).
  256
  257codes([]) --> [].
  258codes([H|T]) --> [H], codes(T).
  259
  260cname(Atom) -->
  261    { atom_codes(Atom, Codes) },
  262    www_encode(Codes, []).
  263
  264%!  cvalue(+Value)// is det.
  265%
  266%   Construct a string from  Value.  Value   is  either  atomic or a
  267%   code-list.
  268
  269cvalue(Value) -->
  270    { atomic(Value),
  271      !,
  272      atom_codes(Value, Codes)
  273    },
  274    www_encode(Codes, []).
  275cvalue(Codes) -->
  276    { must_be(codes, Codes)
  277    },
  278    www_encode(Codes, []).
  279
  280
  281%!  cfragment(+Attributes)//
  282
  283cfragment(A) -->
  284    { memberchk(fragment(Frag), A),
  285      !,
  286      atom_codes(Frag, Codes)
  287    },
  288    "#",
  289    www_encode(Codes, []).
  290cfragment(_) -->
  291    "".
  292
  293
  294                 /*******************************
  295                 *            PARSING           *
  296                 *******************************/
  297
  298%!  parse_url(?URL, ?Attributes) is det.
  299%
  300%   Construct or analyse a URL. URL is an   atom  holding a URL or a
  301%   variable. Attributes is a list of  components. Each component is
  302%   of the format Name(Value). Defined components are:
  303%
  304%       * protocol(Protocol)
  305%       The used protocol. This is, after  the optional =|url:|=, an
  306%       identifier separated from the remainder of  the URL using :.
  307%       parse_url/2 assumes the =http= protocol   if  no protocol is
  308%       specified and the URL can be parsed  as a valid HTTP url. In
  309%       addition to the RFC-1738  specified   protocols,  the =file=
  310%       protocol is supported as well.
  311%
  312%       * host(Host)
  313%       Host-name or IP-address on which   the  resource is located.
  314%       Supported by all network-based protocols.
  315%
  316%       * port(Port)
  317%       Integer port-number to access on   the \arg{Host}. This only
  318%       appears if the port is  explicitly   specified  in  the URL.
  319%       Implicit default ports (e.g., 80 for   HTTP) do _not_ appear
  320%       in the part-list.
  321%
  322%       * path(Path)
  323%       (File-) path addressed by the URL. This is supported for the
  324%       =ftp=, =http= and =file= protocols. If  no path appears, the
  325%       library generates the path =|/|=.
  326%
  327%       * search(ListOfNameValue)
  328%       Search-specification of HTTP URL. This is the part after the
  329%       =|?|=, normally used to transfer data   from HTML forms that
  330%       use the HTTP GET method.  In  the   URL  it  consists  of  a
  331%       www-form-encoded list of Name=Value pairs. This is mapped to
  332%       a list of Prolog Name=Value  terms   with  decoded names and
  333%       values.
  334%
  335%       * fragment(Fragment)
  336%       Fragment specification of HTTP URL. This   is the part after
  337%       the =|#|= character.
  338%
  339%   The example below illustrates all of this for an HTTP URL.
  340%
  341%       ==
  342%       ?- parse_url('http://www.xyz.org/hello?msg=Hello+World%21#x',
  343%              P).
  344%
  345%       P = [ protocol(http),
  346%             host('www.xyz.org'),
  347%             fragment(x),
  348%             search([ msg = 'Hello World!'
  349%                    ]),
  350%             path('/hello')
  351%           ]
  352%       ==
  353%
  354%   By instantiating the parts-list this predicate   can  be used to
  355%   create a URL.
  356
  357parse_url(URL, Attributes) :-
  358    nonvar(URL),
  359    !,
  360    atom_codes(URL, Codes),
  361    phrase(url(Attributes), Codes).
  362parse_url(URL, Attributes) :-
  363    phrase(curl(Attributes), Codes),
  364    !,
  365    atom_codes(URL, Codes).
  366
  367%!  parse_url(+URL, +BaseURL, -Attributes) is det.
  368%
  369%   Similar to parse_url/2 for relative URLs.  If URL is relative,
  370%   it is resolved using the absolute URL BaseURL.
  371
  372parse_url(URL, BaseURL, Attributes) :-
  373    nonvar(URL),
  374    !,
  375    atom_codes(URL, Codes),
  376    (   phrase(absolute_url, Codes, _)
  377    ->  phrase(url(Attributes), Codes)
  378    ;   (   atomic(BaseURL)
  379        ->  parse_url(BaseURL, BaseA0)
  380        ;   BaseA0 = BaseURL
  381        ),
  382        select(path(BasePath), BaseA0, BaseA1),
  383        delete(BaseA1, search(_), BaseA2),
  384        delete(BaseA2, fragment(_), BaseA3),
  385        phrase(relative_uri(URIA0), Codes),
  386        select(path(LocalPath), URIA0, URIA1),
  387        !,
  388        globalise_path(LocalPath, BasePath, Path),
  389        append(BaseA3, [path(Path)|URIA1], Attributes)
  390    ).
  391parse_url(URL, BaseURL, Attributes) :-
  392    parse_url(BaseURL, BaseAttributes),
  393    memberchk(path(BasePath), BaseAttributes),
  394    (   memberchk(path(LocalPath), Attributes)
  395    ->  globalise_path(LocalPath, BasePath, Path)
  396    ;   Path = BasePath
  397    ),
  398    append([path(Path)|Attributes], BaseAttributes, GlobalAttributes),
  399    phrase(curl(GlobalAttributes), Chars),
  400    atom_codes(URL, Chars).
  401
  402
  403%!  globalise_path(+LocalPath, +RelativeTo, -FullPath) is det.
  404%
  405%   The first clause deals with the  standard URL /... global paths.
  406%   The second with file://drive:path on MS-Windows.   This is a bit
  407%   of a cludge, but unfortunately common practice is -especially on
  408%   Windows- not always following the standard
  409
  410globalise_path(LocalPath, _, LocalPath) :-
  411    sub_atom(LocalPath, 0, _, _, /),
  412    !.
  413globalise_path(LocalPath, _, LocalPath) :-
  414    is_absolute_file_name(LocalPath),
  415    !.
  416globalise_path(Local, Base, Path) :-
  417    base_dir(Base, BaseDir),
  418    make_path(BaseDir, Local, Path).
  419
  420base_dir(BasePath, BaseDir) :-
  421    (   atom_concat(BaseDir, /, BasePath)
  422    ->  true
  423    ;   file_directory_name(BasePath, BaseDir)
  424    ).
  425
  426make_path(Dir, Local, Path) :-
  427    atom_concat('../', L2, Local),
  428    file_directory_name(Dir, Parent),
  429    Parent \== Dir,
  430    !,
  431    make_path(Parent, L2, Path).
  432make_path(/, Local, Path) :-
  433    !,
  434    atom_concat(/, Local, Path).
  435make_path(Dir, Local, Path) :-
  436    atomic_list_concat([Dir, /, Local], Path).
  437
  438
  439%!  absolute_url//
  440%
  441%   True if the input  describes  an   absolute  URL.  This means it
  442%   starts with a URL schema. We demand a   schema  of length > 1 to
  443%   avoid confusion with Windows drive letters.
  444
  445absolute_url -->
  446    lwalpha(_First),
  447    schema_chars(Rest),
  448    { Rest \== [] },
  449    ":",
  450    !.
  451
  452
  453                 /*******************************
  454                 *           SEQUENCES          *
  455                 *******************************/
  456
  457digits(L) -->
  458    digits(L, []).
  459
  460digits([C|T0], T) -->
  461    digit(C),
  462    !,
  463    digits(T0, T).
  464digits(T, T) -->
  465    [].
  466
  467
  468digit(C, [C|T], T) :- code_type(C, digit).
  469
  470                 /*******************************
  471                 *            RFC-3986          *
  472                 *******************************/
  473
  474%!  uri(-Parts)//
  475
  476url([protocol(Schema)|Parts]) -->
  477    schema(Schema),
  478    ":",
  479    !,
  480    hier_part(Schema, Parts, P2),
  481    query(P2, P3),
  482    fragment(P3, []).
  483url([protocol(http)|Parts]) -->         % implicit HTTP
  484    authority(Parts, [path(Path)]),
  485    path_abempty(Path).
  486
  487relative_uri(Parts) -->
  488    relative_part(Parts, P2),
  489    query(P2, P3),
  490    fragment(P3, []).
  491
  492relative_part(Parts, Tail) -->
  493    "//",
  494    !,
  495    authority(Parts, [path(Path)|Tail]),
  496    path_abempty(Path).
  497relative_part([path(Path)|T], T) -->
  498    (   path_absolute(Path)
  499    ;   path_noschema(Path)
  500    ;   path_empty(Path)
  501    ),
  502    !.
  503
  504http_location([path(Path)|P2]) -->
  505    path_abempty(Path),
  506    query(P2, P3),
  507    fragment(P3, []).
  508
  509%!  schema(-Atom)//
  510%
  511%   Schema  is  case-insensitive  and  the    canonical  version  is
  512%   lowercase.
  513%
  514%   ==
  515%   Schema ::= ALPHA *(ALPHA|DIGIT|"+"|"-"|".")
  516%   ==
  517
  518schema(Schema) -->
  519    lwalpha(C0),
  520    schema_chars(Codes),
  521    { atom_codes(Schema, [C0|Codes]) }.
  522
  523schema_chars([H|T]) -->
  524    schema_char(H),
  525    !,
  526    schema_chars(T).
  527schema_chars([]) -->
  528    [].
  529
  530schema_char(H) -->
  531    [C],
  532    { C < 128,
  533      (   code_type(C, alpha)
  534      ->  code_type(H, to_lower(C))
  535      ;   code_type(C, digit)
  536      ->  H = C
  537      ;   schema_extra(C)
  538      ->  H = C
  539      )
  540    }.
  541
  542schema_extra(0'+).
  543schema_extra(0'-).
  544schema_extra(0'.).      % 0'
  545
  546
  547%!  hier_part(+Schema, -Parts, ?Tail)//
  548
  549hier_part(file, [path(Path)|Tail], Tail) -->
  550    !,
  551    "//",
  552    (   win_drive_path(Path)
  553    ;   path_absolute(Path)
  554    ;   path_rootless(Path)
  555    ;   path_empty(Path)
  556    ),
  557    !.
  558hier_part(_, Parts, Tail) -->
  559    "//",
  560    !,
  561    authority(Parts, [path(Path)|Tail]),
  562    path_abempty(Path).
  563hier_part(_, [path(Path)|T], T) -->
  564    (   path_absolute(Path)
  565    ;   path_rootless(Path)
  566    ;   path_empty(Path)
  567    ),
  568    !.
  569
  570authority(Parts, Tail) -->
  571    user_info_chars(UserChars),
  572    "@",
  573    !,
  574    { atom_codes(User, UserChars),
  575      Parts = [user(User),host(Host)|T0]
  576    },
  577    host(Host),
  578    port(T0,Tail).
  579authority([host(Host)|T0], Tail) -->
  580    host(Host),
  581    port(T0, Tail).
  582
  583user_info_chars([H|T]) -->
  584    user_info_char(H),
  585    !,
  586    user_info_chars(T).
  587user_info_chars([]) -->
  588    [].
  589
  590user_info_char(_) --> "@", !, {fail}.
  591user_info_char(C) --> pchar(C).
  592
  593%host(Host) --> ip_literal(Host), !.            % TBD: IP6 addresses
  594host(Host) --> ip4_address(Host), !.
  595host(Host) --> reg_name(Host).
  596
  597ip4_address(Atom) -->
  598    i256_chars(Chars, [0'.|T0]),
  599    i256_chars(T0, [0'.|T1]),
  600    i256_chars(T1, [0'.|T2]),
  601    i256_chars(T2, []),
  602    { atom_codes(Atom, Chars) }.
  603
  604i256_chars(Chars, T) -->
  605    digits(Chars, T),
  606    { \+ \+ ( T = [],
  607              Chars \== [],
  608              number_codes(I, Chars),
  609              I < 256
  610            )
  611    }.
  612
  613reg_name(Host) -->
  614    reg_name_chars(Chars),
  615    { atom_codes(Host, Chars) }.
  616
  617reg_name_chars([H|T]) -->
  618    reg_name_char(H),
  619    !,
  620    reg_name_chars(T).
  621reg_name_chars([]) -->
  622    [].
  623
  624reg_name_char(C) -->
  625    pchar(C),
  626    { C \== 0':,
  627      C \== 0'@
  628    }.
  629
  630port([port(Port)|T], T) -->
  631    ":",
  632    !,
  633    digit(D0),
  634    digits(Ds),
  635    { number_codes(Port, [D0|Ds]) }.
  636port(T, T) -->
  637    [].
  638
  639path_abempty(Path) -->
  640    segments_chars(Chars, []),
  641    {   Chars == []
  642    ->  Path = '/'
  643    ;   atom_codes(Path, Chars)
  644    }.
  645
  646
  647win_drive_path(Path) -->
  648    drive_letter(C0),
  649    ":",
  650    (   "/"
  651    ->  {Codes = [C0, 0':, 0'/|Chars]}
  652    ;   {Codes = [C0, 0':|Chars]}
  653    ),
  654    segment_nz_chars(Chars, T0),
  655    segments_chars(T0, []),
  656    { atom_codes(Path, Codes) }.
  657
  658
  659path_absolute(Path) -->
  660    "/",
  661    segment_nz_chars(Chars, T0),
  662    segments_chars(T0, []),
  663    { atom_codes(Path, [0'/| Chars]) }.
  664
  665path_noschema(Path) -->
  666    segment_nz_nc_chars(Chars, T0),
  667    segments_chars(T0, []),
  668    { atom_codes(Path, Chars) }.
  669
  670path_rootless(Path) -->
  671    segment_nz_chars(Chars, T0),
  672    segments_chars(T0, []),
  673    { atom_codes(Path, Chars) }.
  674
  675path_empty('/') -->
  676    "".
  677
  678segments_chars([0'/|Chars], T) -->      % 0'
  679    "/",
  680    !,
  681    segment_chars(Chars, T0),
  682    segments_chars(T0, T).
  683segments_chars(T, T) -->
  684    [].
  685
  686segment_chars([H|T0], T) -->
  687    pchar(H),
  688    !,
  689    segment_chars(T0, T).
  690segment_chars(T, T) -->
  691    [].
  692
  693segment_nz_chars([H|T0], T) -->
  694    pchar(H),
  695    segment_chars(T0, T).
  696
  697segment_nz_nc_chars([H|T0], T) -->
  698    segment_nz_nc_char(H),
  699    !,
  700    segment_nz_nc_chars(T0, T).
  701segment_nz_nc_chars(T, T) -->
  702    [].
  703
  704segment_nz_nc_char(_) --> ":", !, {fail}.
  705segment_nz_nc_char(C) --> pchar(C).
  706
  707
  708%!  query(-Parts, ?Tail)// is det.
  709%
  710%   Extract &Name=Value, ...
  711
  712query([search(Params)|T], T) -->
  713    "?",
  714    !,
  715    search(Params).
  716query(T,T) -->
  717    [].
  718
  719search([Parameter|Parameters])-->
  720    parameter(Parameter),
  721    !,
  722    (   search_sep
  723    ->  search(Parameters)
  724    ;   { Parameters = [] }
  725    ).
  726search([]) -->
  727    [].
  728
  729parameter(Param)-->
  730    !,
  731    search_chars(NameS),
  732    { atom_codes(Name, NameS)
  733    },
  734    (   "="
  735    ->  search_value_chars(ValueS),
  736        { atom_codes(Value, ValueS),
  737          Param = (Name = Value)
  738        }
  739    ;   { Param = Name
  740        }
  741    ).
  742
  743search_chars([C|T]) -->
  744    search_char(C),
  745    !,
  746    search_chars(T).
  747search_chars([]) -->
  748    [].
  749
  750search_char(_) --> search_sep, !, { fail }.
  751search_char(_) --> "=", !, { fail }.
  752search_char(C) --> fragment_char(C).
  753
  754search_value_chars([C|T]) -->
  755    search_value_char(C),
  756    !,
  757    search_value_chars(T).
  758search_value_chars([]) -->
  759    [].
  760
  761search_value_char(_) --> search_sep, !, { fail }.
  762search_value_char(C) --> fragment_char(C).
  763
  764%!  search_sep// is semidet.
  765%
  766%   Matches a search-parameter separator.  Traditionally, this is the
  767%   &-char, but these days there are `newstyle' ;-char separators.
  768%
  769%   @see http://perldoc.perl.org/CGI.html
  770%   @tbd This should be configurable
  771
  772search_sep --> "&", !.
  773search_sep --> ";".
  774
  775
  776%!  fragment(-Fragment, ?Tail)//
  777%
  778%   Extract the fragment (after the =#=)
  779
  780fragment([fragment(Fragment)|T], T) -->
  781    "#",
  782    !,
  783    fragment_chars(Codes),
  784    { atom_codes(Fragment, Codes) }.
  785fragment(T, T) -->
  786    [].
  787
  788fragment_chars([H|T]) -->
  789    fragment_char(H),
  790    !,
  791    fragment_chars(T).
  792fragment_chars([]) -->
  793    [].
  794
  795
  796%!  fragment_char(-Char)
  797%
  798%   Find a fragment character.
  799
  800fragment_char(C)   --> pchar(C), !.
  801fragment_char(0'/) --> "/", !.
  802fragment_char(0'?) --> "?", !.
  803fragment_char(0'[) --> "[", !.          % Not according RDF3986!
  804fragment_char(0']) --> "]", !.
  805
  806
  807                 /*******************************
  808                 *      CHARACTER CLASSES       *
  809                 *******************************/
  810
  811%!  pchar(-Code)//
  812%
  813%   unreserved|pct_encoded|sub_delim|":"|"@"
  814%
  815%   Performs UTF-8 decoding of percent encoded strings.
  816
  817pchar(0'\s) --> "+", !.
  818pchar(C) -->
  819    [C],
  820    {   unreserved(C)
  821    ;   sub_delim(C)
  822    ;   C == 0':
  823    ;   C == 0'@
  824    },
  825    !.
  826pchar(C) -->
  827    percent_coded(C).
  828
  829%!  lwalpha(-C)//
  830%
  831%   Demand alpha, return as lowercase
  832
  833lwalpha(H) -->
  834    [C],
  835    { C < 128,
  836      code_type(C, alpha),
  837      code_type(H, to_lower(C))
  838    }.
  839
  840drive_letter(C) -->
  841    [C],
  842    { C < 128,
  843      code_type(C, alpha)
  844    }.
  845
  846
  847                 /*******************************
  848                 *      RESERVED CHARACTERS     *
  849                 *******************************/
  850
  851%!  sub_delim(?Code)
  852%
  853%   Sub-delimiters
  854
  855sub_delim(0'!).
  856sub_delim(0'$).
  857sub_delim(0'&).
  858sub_delim(0'').
  859sub_delim(0'().
  860sub_delim(0')).
  861sub_delim(0'*).
  862sub_delim(0'+).
  863sub_delim(0',).
  864sub_delim(0';).
  865sub_delim(0'=).
  866
  867
  868%!  unreserved(+C)
  869%
  870%   Characters that can be represented without percent escaping
  871%   RFC 3986, section 2.3
  872
  873term_expansion(unreserved(map), Clauses) :-
  874    findall(unreserved(C), unreserved_(C), Clauses).
  875
  876unreserved_(C) :-
  877    between(1, 128, C),
  878    code_type(C, alnum).
  879unreserved_(0'-).
  880unreserved_(0'.).
  881unreserved_(0'_).
  882unreserved_(0'~).                       % 0'
  883
  884unreserved(map).                        % Expanded
  885
  886
  887                 /*******************************
  888                 *              FORMS           *
  889                 *******************************/
  890
  891/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  892Encoding/decoding of form-fields  using   the  popular  www-form-encoded
  893encoding used with the HTTP GET.
  894- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  895
  896%!  www_form_encode(+Value, -XWWWFormEncoded) is det.
  897%!  www_form_encode(-Value, +XWWWFormEncoded) is det.
  898%
  899%   En/decode   to/from   application/x-www-form-encoded.   Encoding
  900%   encodes all characters  except  RFC   3986  _unreserved_  (ASCII
  901%   =alnum= (see code_type/2)), and  one   of  "-._~"  using percent
  902%   encoding.  Newline  is  mapped  to  =|%OD%OA|=.  When  decoding,
  903%   newlines appear as a single newline (10) character.
  904%
  905%   Note that a space  is  encoded   as  =|%20|=  instead  of =|+|=.
  906%   Decoding decodes both to a space.
  907%
  908%   @deprecated Use uri_encoded/3 for new code.
  909
  910www_form_encode(Value, Encoded) :-
  911    atomic(Value),
  912    !,
  913    atom_codes(Value, Codes),
  914    phrase(www_encode(Codes, []), EncCodes),
  915    atom_codes(Encoded, EncCodes).
  916www_form_encode(Value, Encoded) :-
  917    atom_codes(Encoded, EncCodes),
  918    phrase(www_decode(Codes), EncCodes),
  919    atom_codes(Value, Codes).
  920
  921%!  www_encode(+Codes, +ExtraUnescaped)//
  922
  923www_encode([0'\r, 0'\n|T], Extra) -->
  924    !,
  925    "%0D%0A",
  926    www_encode(T, Extra).
  927www_encode([0'\n|T], Extra) -->
  928    !,
  929    "%0D%0A",
  930    www_encode(T, Extra).
  931www_encode([H|T], Extra) -->
  932    percent_encode(H, Extra),
  933    www_encode(T, Extra).
  934www_encode([], _) -->
  935    "".
  936
  937percent_encode(C, _Extra) -->
  938    { unreserved(C) },
  939    !,
  940    [C].
  941percent_encode(C, Extra) -->
  942    { memberchk(C, Extra) },
  943    !,
  944    [C].
  945%percent_encode(0' , _) --> !, "+".     % Deprecated: use %20
  946percent_encode(C, _) -->
  947    { C =< 127 },
  948    !,
  949    percent_byte(C).
  950percent_encode(C, _) -->                % Unicode characters
  951    { current_prolog_flag(url_encoding, utf8),
  952      !,
  953      phrase(utf8_codes([C]), Bytes)
  954    },
  955    percent_bytes(Bytes).
  956percent_encode(C, _) -->
  957    { C =< 255 },
  958    !,
  959    percent_byte(C).
  960percent_encode(_C, _) -->
  961    { representation_error(url_character)
  962    }.
  963
  964percent_bytes([]) -->
  965    "".
  966percent_bytes([H|T]) -->
  967    percent_byte(H),
  968    percent_bytes(T).
  969
  970percent_byte(C) -->
  971    [0'%, D1, D2],
  972    {   nonvar(C)
  973    ->  Dv1 is (C>>4 /\ 0xf),
  974        Dv2 is (C /\ 0xf),
  975        code_type(D1, xdigit(Dv1)),
  976        code_type(D2, xdigit(Dv2))
  977    ;   code_type(D1, xdigit(Dv1)),
  978        code_type(D2, xdigit(Dv2)),
  979        C is ((Dv1)<<4) + Dv2
  980    }.
  981
  982percent_coded(C) -->
  983    percent_byte(C0),
  984    !,
  985    (   { C0 == 13                  % %0D%0A --> \n
  986        },
  987        "%0",
  988        ( "A" ; "a" )
  989    ->  { C = 10
  990        }
  991    ;   { C0 >= 0xc0 },             % UTF-8 lead-in
  992        utf8_cont(Cs),
  993        { phrase(utf8_codes([C]), [C0|Cs]) }
  994    ->  []
  995    ;   { C = C0
  996        }
  997    ).
  998
  999%!  www_decode(-Codes)//
 1000
 1001www_decode([0' |T]) -->
 1002    "+",
 1003    !,
 1004    www_decode(T).
 1005www_decode([C|T]) -->
 1006    percent_coded(C),
 1007    !,
 1008    www_decode(T).
 1009www_decode([C|T]) -->
 1010    [C],
 1011    !,
 1012    www_decode(T).
 1013www_decode([]) -->
 1014    [].
 1015
 1016utf8_cont([H|T]) -->
 1017    percent_byte(H),
 1018    { between(0x80, 0xbf, H) },
 1019    !,
 1020    utf8_cont(T).
 1021utf8_cont([]) -->
 1022    [].
 1023
 1024
 1025%!  set_url_encoding(?Old, +New) is semidet.
 1026%
 1027%   Query and set the encoding for URLs.  The default is =utf8=.
 1028%   The only other defined value is =iso_latin_1=.
 1029%
 1030%   @tbd    Having a global flag is highly inconvenient, but a
 1031%           work-around for old sites using ISO Latin 1 encoding.
 1032
 1033:- create_prolog_flag(url_encoding, utf8, [type(atom)]). 1034
 1035set_url_encoding(Old, New) :-
 1036    current_prolog_flag(url_encoding, Old),
 1037    (   Old == New
 1038    ->  true
 1039    ;   must_be(oneof([utf8, iso_latin_1]), New),
 1040        set_prolog_flag(url_encoding, New)
 1041    ).
 1042
 1043
 1044                 /*******************************
 1045                 *       IRI PROCESSING         *
 1046                 *******************************/
 1047
 1048%!  url_iri(+Encoded, -Decoded) is det.
 1049%!  url_iri(-Encoded, +Decoded) is det.
 1050%
 1051%   Convert between a URL, encoding in US-ASCII   and an IRI. An IRI
 1052%   is a fully expanded Unicode string.   Unicode  strings are first
 1053%   encoded into UTF-8, after which %-encoding takes place.
 1054
 1055url_iri(Encoded, Decoded) :-
 1056    nonvar(Encoded),
 1057    !,
 1058    (   sub_atom(Encoded, _, _, _, '%')
 1059    ->  atom_codes(Encoded, Codes),
 1060        unescape_precent(Codes, UTF8),
 1061        phrase(utf8_codes(Unicodes), UTF8),
 1062        atom_codes(Decoded, Unicodes)
 1063    ;   Decoded = Encoded
 1064    ).
 1065url_iri(URL, IRI) :-
 1066    atom_codes(IRI, IRICodes),
 1067    atom_codes('/:?#&=', ExtraEscapes),
 1068    phrase(www_encode(IRICodes, ExtraEscapes), UrlCodes),
 1069    atom_codes(URL, UrlCodes).
 1070
 1071
 1072unescape_precent([], []).
 1073unescape_precent([0'%,C1,C2|T0], [H|T]) :-     %'
 1074    !,
 1075    code_type(C1, xdigit(D1)),
 1076    code_type(C2, xdigit(D2)),
 1077    H is D1*16 + D2,
 1078    unescape_precent(T0, T).
 1079unescape_precent([H|T0], [H|T]) :-
 1080    unescape_precent(T0, T).
 1081
 1082
 1083                 /*******************************
 1084                 *           FORM DATA          *
 1085                 *******************************/
 1086
 1087%!  parse_url_search(?Spec, ?Fields:list(Name=Value)) is det.
 1088%
 1089%   Construct or analyze an HTTP   search  specification. This deals
 1090%   with       form       data       using       the       MIME-type
 1091%   =application/x-www-form-urlencoded=  as  used   in    HTTP   GET
 1092%   requests.
 1093
 1094parse_url_search(Spec, Fields) :-
 1095    atomic(Spec),
 1096    !,
 1097    atom_codes(Spec, Codes),
 1098    phrase(search(Fields), Codes).
 1099parse_url_search(Codes, Fields) :-
 1100    is_list(Codes),
 1101    !,
 1102    phrase(search(Fields), Codes).
 1103parse_url_search(Codes, Fields) :-
 1104    must_be(list, Fields),
 1105    phrase(csearch(Fields, []), Codes).
 1106
 1107
 1108                 /*******************************
 1109                 *          FILE URLs           *
 1110                 *******************************/
 1111
 1112%!  file_name_to_url(+File, -URL) is det.
 1113%!  file_name_to_url(-File, +URL) is semidet.
 1114%
 1115%   Translate between a filename and a file:// URL.
 1116%
 1117%   @tbd    Current implementation does not deal with paths that
 1118%           need special encoding.
 1119
 1120file_name_to_url(File, FileURL) :-
 1121    nonvar(File),
 1122    !,
 1123    absolute_file_name(File, Path),
 1124    atom_concat('file://', Path, FileURL),
 1125    !.
 1126file_name_to_url(File, FileURL) :-
 1127    atom_concat('file://', File, FileURL),
 1128    !