package urlutil import ( "net/url" "strings" errorutil "github.com/projectdiscovery/utils/errors" stringsutil "github.com/projectdiscovery/utils/strings" ) // ## URL Parsing Methods // Function | Description | Type | Behavior | // -----------------------------------------------------|--------------------------------------------------|-------------------------------|------------------------------------------| // `Parse(inputURL string)` | Standard URL Parsing (+ Some Edgecases) | Both Relative & Absolute URLs | NA | // `ParseURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Both Relative & Absolute URLs | NA | // `ParseRelativeURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Relative URLs | error if absolute URL is given | // `ParseRawRelativeURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing | Only Relative URLs | error if absolute URL is given | // `ParseAbsoluteURL(inputURL string, unsafe bool)` | Standard + Unsafe URL Parsing (+ Edgecases) | Only Absolute URLs | error if relative URL is given | // ParseURL (can be relative or absolute) func Parse(inputURL string) (*URL, error) { return ParseURL(inputURL, false) } // Parse and return URL (can be relative or absolute) func ParseURL(inputURL string, unsafe bool) (*URL, error) { u := &URL{ URL: &url.URL{}, Original: inputURL, Unsafe: unsafe, Params: NewOrderedParams(), } var err error u, err = absoluteURLParser(u) if err != nil { return nil, err } if u.IsRelative { return ParseRelativePath(inputURL, unsafe) } // logical bug url is not relative but host is empty if u.Host == "" { return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative") } // # Normalization 1: if value of u.Host does not look like a common domain // it is most likely a relative path parsed as host // this happens because of ambiguity of url.Parse // because // when parsing url like scanme.sh/my/path url.Parse() puts `scanme.sh/my/path` as path and host is empty // to avoid this we always parse url with a schema prefix if it is missing (ex: https:// is not in input url) and then // rule out the possiblity that given url is not a relative path // this handles below edgecase // u , err := url.Parse(`mypath`) if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" { // TODO: should use a proper regex to validate hostname/ip // currently domain names without (.) are not considered as valid and autocorrected // this does not look like a valid domain , ipv4 or ipv6 // consider it as relative // use ParseAbosluteURL to avoid this issue u.IsRelative = true u.Path = inputURL u.Host = "" } return u, nil } // ParseAbsoluteURL parses and returns absolute url // should be preferred over others when input is known to be absolute url // this reduces any normalization and autocorrection related to relative paths // and returns error if input is relative path func ParseAbsoluteURL(inputURL string, unsafe bool) (*URL, error) { u := &URL{ URL: &url.URL{}, Original: inputURL, Unsafe: unsafe, Params: NewOrderedParams(), } var err error u, err = absoluteURLParser(u) if err != nil { return nil, err } if u.IsRelative { return nil, errorutil.NewWithTag("urlutil", "expected absolute url but got relative url input=%v,path=%v", inputURL, u.Path) } if u.URL.Host == "" { return nil, errorutil.NewWithTag("urlutil", "something went wrong got empty host for absolute url=%v", inputURL) } return u, nil } // ParseRelativePath parses and returns relative path // should be preferred over others when input is known to be relative path // this reduces any normalization and autocorrection related to absolute paths // and returns error if input is absolute path func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) { u := &URL{ URL: &url.URL{}, Original: inputURL, Unsafe: unsafe, IsRelative: true, } return relativePathParser(u) } // ParseRelativePath func ParseRawRelativePath(inputURL string, unsafe bool) (*URL, error) { u := &URL{ URL: &url.URL{}, Original: inputURL, Unsafe: unsafe, IsRelative: true, disableAutoCorrect: true, } return relativePathParser(u) } // absoluteURLParser is common absolute parser logic used to avoid duplication of code func absoluteURLParser(u *URL) (*URL, error) { u.fetchParams() // filter out fragments and parameters only then parse path // we use u.Original because u.fetchParams() parses fragments and parameters // from u.Original (this is done to preserve query order in params and other edgecases) if u.Original == "" { return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input") } // Note: we consider //scanme.sh as valid (since all browsers accept this