Hướng dẫn golang html tokenizer example - Galang Hatmya Ví dụ về Tokenize

Question

Go giúp việc xây dựng không chỉ máy chủ HTTP mà cả máy khách HTTP trở nên thực sự dễ dàng. Nghĩa là, bạn có thể nhanh chóng viết các chương trình thực hiện yêu cầu HTTP và xử lý kết quả. Nếu kết quả là HTML, Go cũng giúp việc tách HTML thành các token (tokenize) và kiểm tra kết quả đó khá dễ dàng.

Nội dung chính Show

Tạo một chương trình Go đơn giản
Tìm nạp URL
Tokenize HTML
Tái cấu trúc mã
Cho nó một vòng xoáy
Mở rộng nó

Tạo một chương trình Go đơn giản

Để cho bạn thấy tất cả hoạt động như thế nào, hãy tạo một chương trình tìm nạp một trang HTML và trích xuất giá trị của phần tử `<title>` của nó.

Tạo một thư mục mới có tên `pagetitle` trong thư mục `$GOPATH/src` của bạn. Bên trong thư mục đó, hãy tạo một tệp có tên `main.go` và thêm bộ khung chương trình Go cơ bản này vào:

package main

// main is the program entry point. It is deliberately empty: this is the bare
// skeleton that the following steps of the tutorial fill in.
func main() {}

Chúng ta có thể viết cứng (hard-code) một URL ở đây, nhưng chương trình sẽ linh hoạt hơn nhiều nếu đọc URL từ một đối số dòng lệnh. Các đối số này có sẵn thông qua slice `os.Args`. Phần tử đầu tiên trong slice là lệnh được dùng để chạy chương trình của bạn, vì vậy tham số đầu tiên thực sự nằm ở `os.Args[1]`. Nếu người dùng không cung cấp tham số đó, chúng ta nên mô tả cách sử dụng chương trình và thoát với mã khác không, cho biết đã xảy ra lỗi:

package main

import (
    "fmt"
    "os"
)

// main reads the URL to fetch from the first command-line argument.
// When no argument is supplied it prints a usage message and exits with
// status 1.
func main() {
    // if the caller didn't provide a URL to fetch...
    if len(os.Args) < 2 {
        // print the usage and exit with an error
        fmt.Printf("usage:\n  pagetitle <url>\n")
        os.Exit(1)
    }

    // os.Args[0] is the command itself, so the first real argument is at index 1
    URL := os.Args[1]

    // Go refuses to compile a local variable that is declared but never used.
    // This blank assignment keeps the snippet buildable at this step; remove it
    // once the fetching code actually uses URL.
    _ = URL
}

Bạn có thể tự hỏi tại sao tôi dùng `URL` cho tên biến ở đây thay vì `url`. Có hai lý do. Thứ nhất, có một gói thư viện chuẩn của Go tên là `url`, vì vậy tốt nhất nên tránh dùng nó làm tên biến. Nếu sau này bạn nhập gói `url` (chẳng hạn để phân tích URL thành các thành phần của nó), biến cục bộ của bạn sẽ che khuất (shadow) tên do gói được nhập tạo ra, và bạn sẽ không thể truy cập các hàm của gói chừng nào biến cục bộ còn trong phạm vi.

Lý do thứ hai là Go có quan điểm rất rõ ràng về phong cách mã. Quy tắc phong cách của Go nói rằng các từ viết tắt trong tên biến nên được viết hoa toàn bộ. Nếu bạn đặt tên biến là `pageUrl`, Go linter sẽ đưa ra cảnh báo rằng `pageUrl` phải là `pageURL`.

Tìm nạp URL

Bây giờ chúng ta đã biết cần tìm nạp URL nào, đã đến lúc thực hiện điều đó. Rất may, Go làm cho việc này dễ dàng đến khó tin. Gói `net/http` của Go có hàm `http.Get()` thực hiện chính xác những gì chúng ta cần:

// GET the URL
resp, err := http.Get(URL)

// if there was an error, report it and exit
if err != nil {
    // log.Fatalf() prints the error and exits the process
    log.Fatalf("error fetching URL: %v\n", err)
}

// make sure the response body gets closed; this is registered only after the
// error check above, so resp is never touched when the request failed
defer resp.Body.Close()

Hàm `http.Get()` trả về một con trỏ tới cấu trúc http.Response và có thể kèm theo một lỗi. Nếu lỗi khác nil, tốt nhất nên giả định rằng con trỏ phản hồi sẽ là nil. Vì vậy, một mẫu rất phổ biến là kiểm tra xem `err != nil` hay không, và nếu có thì trả lỗi về cho nơi gọi. Vì chúng ta vẫn đang ở trong hàm `main()`, chúng ta chỉ cần dùng `log.Fatalf()` để in thông báo lỗi và thoát khỏi tiến trình với mã lỗi.

Nếu không có lỗi, chúng ta có một phản hồi hợp lệ và biến `resp` giờ đây sẽ giữ tất cả thông tin phản hồi: mã trạng thái, các tiêu đề (header) và phần thân. Phần thân này là một `io.ReadCloser`, gợi ý rằng chúng ta cần đóng luồng này trước khi thoát khỏi chương trình. Ở đây chúng ta dùng câu lệnh defer của Go để đảm bảo phương thức `.Close()` được gọi trên thân phản hồi bất kể chúng ta thoát khỏi hàm hiện tại bằng cách nào.

Để chúng tôi xử lý phản hồi này dưới dạng trang HTML, chúng tôi vẫn cần xác minh hai điều trước khi tiến hành:

- Mã trạng thái == `http.StatusOK` (200)
- Tiêu đề `Content-Type` bắt đầu bằng `text/html`

Nếu người dùng đưa cho chúng ta một URL không khớp với bất cứ thứ gì trên máy chủ, máy chủ sẽ phản hồi bằng lỗi `404 Not Found` và thân phản hồi sẽ không có ích gì cho chúng ta. Và nếu người dùng đưa một URL trỏ tới thứ gì đó như hình ảnh hoặc bảng kiểu CSS, thân phản hồi sẽ không phân tích được thành HTML hợp lệ. Chúng ta có thể xác minh những điều này bằng một vài phép kiểm tra:

// check response status code: anything other than 200 OK is reported
// through log.Fatalf
if resp.StatusCode != http.StatusOK {
    log.Fatalf("response status code was %d\n", resp.StatusCode)
}

// check response content type: only the prefix is compared, so any header
// value that begins with "text/html" is accepted
ctype := resp.Header.Get("Content-Type")
if !strings.HasPrefix(ctype, "text/html") {
    log.Fatalf("response content type was %s not text/html\n", ctype)
}

Gói `strings` cung cấp một số hàm tiện dụng liên quan đến chuỗi. Ở đây chúng ta dùng `strings.HasPrefix()`, hàm này trả về true nếu chuỗi được truyền làm tham số thứ nhất bắt đầu bằng chuỗi được truyền làm tham số thứ hai.

Lưu ý: Chúng ta dùng `log.Fatalf()` ở đây vì đây là một tiện ích dòng lệnh và chúng ta muốn thoát khỏi tiến trình khi có lỗi. Nếu bạn đang tìm nạp và tokenize HTML bên trong một máy chủ web, đừng dùng log.Fatalf(), vì việc thoát khỏi tiến trình sẽ dừng máy chủ web của bạn. Thay vào đó, hãy xử lý lỗi bằng cách phản hồi cho máy khách bằng hàm `http.Error()`. Hàm này ghi lỗi cho máy khách và để máy chủ của bạn tiếp tục chạy.

(Note: we are using `log.Fatalf()` here because this is a command-line utility, and we want to exit the process in the event of an error. If you are fetching and tokenizing HTML in a web server, do not use log.Fatalf(), as exiting the process will stop your web server. Instead, handle errors by responding to your client using the `http.Error()` function. This writes the error to the client and leaves your server running.)

Tokenize HTML

Bây giờ chúng ta đã sẵn sàng tokenize HTML trong phản hồi và trích xuất tiêu đề trang. Nếu bạn chưa quen với từ 'tokenize', thì đó đơn giản là quá trình chia một luồng ký tự thành các token riêng biệt được xác định bởi một ngữ pháp cụ thể, trong trường hợp này là HTML. Các token trong HTML là thẻ bắt đầu (`<html>`), thẻ tự đóng (`<br/>`), thẻ kết thúc (`</html>`) và nội dung văn bản thuần trong một phần tử.

Tokenize là bước đầu tiên để phân tích tài liệu thành một cây gồm các phần tử và nút văn bản, giống như DOM. Cây đầy đủ đó thực sự hữu ích cho một chương trình như trình duyệt web, vốn cho phép thao tác và hiển thị lại cây theo thời gian, nhưng nó có cái giá của nó: việc xây dựng tất cả các nút cây đó tiêu tốn rất nhiều bộ nhớ, và chúng ta phải xử lý toàn bộ tài liệu, ngay cả khi tất cả những gì chúng ta muốn chỉ là một phần tử trong phần `<head>`. Sẽ hiệu quả hơn nhiều nếu chỉ tokenize trang, tìm riêng phần tử `<title>` và thoát ngay khi tìm thấy nó.

Việc tokenize và phân tích cú pháp HTML chưa nằm trong thư viện chuẩn của Go, nhưng nhóm Go đã phát hành một "gói bổ sung" cho việc này. Các gói bổ sung này thường trở thành một phần của thư viện chuẩn trong một bản phát hành tương lai (ví dụ: gói `context` khởi đầu là một gói bổ sung), nhưng chỉ sau khi nhóm Go hài lòng với API. Vì họ đảm bảo khả năng tương thích ngược ở mức mã nguồn với mỗi bản phát hành, họ chỉ thêm thứ gì đó vào thư viện chuẩn khi chắc chắn rằng có thể đóng băng API.

Nhưng các nhà phát triển có thể bắt đầu sử dụng các gói bổ sung ngay lập tức; nhóm Go thực sự khuyến khích các nhà phát triển làm như vậy và gửi phản hồi cho họ. Để lấy gói tokenize và phân tích cú pháp HTML bổ sung này, hãy dùng lệnh sau trong terminal dòng lệnh của bạn:

go get golang.org/x/net/html

Lệnh này sẽ tải mã nguồn của gói vào thư mục `$GOPATH/src` của bạn, biên dịch nó và cài đặt tệp đối tượng kết quả vào thư mục `$GOPATH/pkg` của bạn.

Để sử dụng gói này trong dự án, chúng ta cần nhập nó. Thêm đường dẫn nhập của gói vào danh sách `import` của bạn:

import (
    //...existing imports...

    "golang.org/x/net/html"
)

Bởi vì đường dẫn nhập giống hệt với đường dẫn bạn truyền cho `go get`, các phụ thuộc của bạn hoàn toàn tự mô tả. Nếu các nhà phát triển khác sao chép repo của bạn, tất cả những gì họ cần làm là chạy `go get` không kèm đối số, và nó sẽ tự động tải xuống và cài đặt tất cả các gói đã nhập.

Bắt đầu bằng cách tạo một tokenizer mới trên thân phản hồi:

// create a new tokenizer that reads its HTML input from the response body
tokenizer := html.NewTokenizer(resp.Body)

Nếu bạn xem tài liệu của gói, hàm `html.NewTokenizer()` thực ra nhận vào một giao diện `io.Reader`. Điều này cho phép tokenizer hoạt động trên bất kỳ luồng HTML nào hiện thực giao diện io.Reader rất tối giản. Ví dụ: nó có thể tokenize HTML được trả về từ một yêu cầu HTTP, được đọc từ tệp, hoặc được lấy từ cơ sở dữ liệu: bất kỳ nguồn nào thỏa mãn giao diện `io.Reader`.

Tokenizer có phương thức `.Next()` trả về loại của token tiếp theo. Bạn nên gọi phương thức này trong một vòng lặp chỉ thoát khi bạn tìm thấy phần tử `<title>` hoặc khi gặp lỗi (bao gồm cả việc hết luồng). Vòng lặp `for` của Go xử lý điều này bằng cách bỏ qua các biểu thức khởi tạo, so sánh và bước lặp:

// keep pulling tokens until the title has been found or the tokenizer
// reports an error (running out of input is reported as an error too)
for {
    // advance the tokenizer and remember what kind of token it produced
    tokenType := tokenizer.Next()

    if tokenType == html.ErrorToken {
        // anything other than io.EOF means tokenizing itself failed, which
        // most likely means the HTML was malformed; since this is a simple
        // command-line utility, log.Fatalf() reports it and exits the
        // process with a non-zero status code
        if err := tokenizer.Err(); err != io.EOF {
            log.Fatalf("error tokenizing HTML: %v", err)
        }
        // io.EOF: the input is exhausted, so leave the loop
        break
    }

    // process the token according to the token type...
}

`tokenType` được trả về từ `tokenizer.Next()` sẽ là một trong các hằng số loại token của gói. Ở đây chúng ta kiểm tra xem đó có phải là token lỗi hay không và xử lý lỗi. Token lỗi xuất hiện vì chúng ta đã đến cuối luồng đầu vào hoặc vì HTML thực sự có lỗi. Nếu là trường hợp đầu, chúng ta chỉ cần thoát khỏi vòng lặp, nhưng trong trường hợp sau, chúng ta cần báo lỗi và thoát. Vì đây chỉ là một tiện ích dòng lệnh đơn giản, chúng ta có thể dùng `log.Fatalf()` để làm điều đó. Trong một máy chủ web hoặc tiến trình chạy lâu dài khác, bạn nên báo lỗi theo cách khác và tiếp tục chạy.

Nếu mã thông báo không phải là mã thông báo lỗi, loại sẽ là một trong những điều sau đây:

- `html.StartTagToken`: thẻ bắt đầu như `<title>`
- `html.EndTagToken`: thẻ kết thúc như `</title>`
- `html.SelfClosingTagToken`: thẻ tự đóng như `<br/>`
- `html.TextToken`: nội dung văn bản trong thẻ
- `html.CommentToken`: chú thích HTML như `<!-- comment -->`
- `html.DoctypeToken`: khai báo loại tài liệu như `<!DOCTYPE html>`

Vì chúng ta cần nội dung của phần tử `<title>`, chúng ta cần tìm một `html.StartTagToken`, và trong trường hợp này chúng ta có thể bỏ qua các loại khác một cách an toàn.

// ...existing looping and
// error-checking code from above...

// only a start tag can open the <title> element
if tokenType == html.StartTagToken {
    // materialize the current token and check whether the element is "title"
    if startTag := tokenizer.Token(); startTag.Data == "title" {
        // advance to the token right after <title>, which should be the page
        // title; tokenType is reassigned so it keeps describing the current token
        if tokenType = tokenizer.Next(); tokenType == html.TextToken {
            // report the page title and break out of the loop
            pageTitle := tokenizer.Token().Data
            fmt.Println(pageTitle)
            break
        }
    }
}

Phương thức `tokenizer.Token()` xây dựng và trả về một cấu trúc `html.Token` đã được điền thông tin về token. Đối với token thẻ bắt đầu, trường `Data` chứa tên của thẻ. Tên thẻ đã được tokenizer chuyển thành chữ thường, vì vậy chúng ta chỉ cần so sánh nó với `"title"`. Nếu đó là phần tử title, chúng ta có thể đọc token tiếp theo, vốn là nội dung văn bản của phần tử.

Vì tiêu đề trang là thứ duy nhất chúng ta cần, chúng ta có thể thoát khỏi vòng lặp ngay khi có được nó. Nếu bạn cần tiếp tục xử lý các thẻ khác thì đừng break, nhưng trong trường hợp này chúng ta có thể làm vậy vì tiêu đề là tất cả những gì chúng ta cần. Vì đây là đoạn mã cuối cùng trong hàm `main()`, chương trình của chúng ta kết thúc sau khi thoát khỏi vòng lặp.

Tái cấu trúc mã

Cách này hoạt động, nhưng toàn bộ mã nằm trong hàm `main()`, điều này sẽ gây khó khăn cho việc thêm tính năng hoặc phát triển tiếp dựa trên nó. Như một bài tập cho người đọc, hãy tái cấu trúc mã thành một số hàm như sau:

// fetchHTML fetches the provided URL and returns the response body or an error.
//
// NOTE: this is an exercise stub. It has no return statement yet, so it will
// not compile until the TODO below is implemented.
func fetchHTML(URL string) (io.ReadCloser, error) {
    // TODO: put HTTP get code and error checking here
    // return errors if GET fails, if response status code
    // is != 200, or if Content-Type is not HTML
}

// extractTitle returns the content within the <title> element found in body,
// or an error.
//
// NOTE: this is an exercise stub. It has no return statement yet, so it will
// not compile until the TODO below is implemented.
func extractTitle(body io.ReadCloser) (string, error) {
    // TODO: put the tokenization code here, and return
    // the page title or an error
}

// fetchTitle fetches the page title for a URL and returns it, or an error.
//
// NOTE: this is an exercise stub. It has no return statement yet, so it will
// not compile until the TODO below is implemented.
func fetchTitle(URL string) (string, error) {
    // TODO: fetch the HTML, extract the title, and make sure the body gets closed
}

// main expects the URL to fetch as the first command-line argument and prints
// that page's title. With no argument it prints a usage message and exits with
// status 1; a fetch failure is reported through log.Fatalf.
func main() {
    args := os.Args

    // without a URL argument there is nothing to fetch: show usage and fail
    if len(args) < 2 {
        fmt.Printf("usage:\n  pagetitle <url>\n")
        os.Exit(1)
    }

    pageTitle, fetchErr := fetchTitle(args[1])
    if fetchErr != nil {
        log.Fatalf("error fetching page title: %v\n", fetchErr)
    }

    // success: the title is the program's only output
    fmt.Println(pageTitle)
}

Bằng cách chia mã thành các chức năng tái sử dụng riêng biệt, chúng ta có thể dễ dàng thêm nhiều tính năng hơn trong tương lai, chẳng hạn như xử lý nhiều URL tại một thời điểm.

Cho nó một vòng xoáy

Để chạy chương trình này, hãy thực hiện các lệnh sau từ trong thư mục `pagetitle` (người dùng Windows nên chạy `pagetitle.exe`):

go install
pagetitle https://google.com

Bạn sẽ nhận được giá trị của phần tử `<title>` trên trang chủ Google.

Bạn có thể kết hợp các lệnh này trong bash thành một dòng, giúp biên dịch lại và chạy lại sau khi thực hiện thay đổi:

go install && pagetitle https://google.com

Mở rộng nó

Một khi bạn đã làm cho nó hoạt động, hãy thử mở rộng chương trình để xử lý nhiều URL được truyền dưới dạng các đối số dòng lệnh riêng biệt. Lặp trên slice `os.Args` bắt đầu từ phần tử

//...existing looping and
//error-checking code from above...

//if this is a start tag token...
if tokenType == html.StartTagToken {
    //get the token
    token := tokenizer.Token()
    //if the name of the element is "title"
    if "title" == token.Data {
        //the next token should be the page title
        tokenType = tokenizer.Next()
        //just make sure it's actually a text token
        if tokenType == html.TextToken {
            //report the page title and break out of the loop
            fmt.Println(tokenizer.Token().Data)
            break
        }
    }
}

4, chuyển từng đối số dòng lệnh cho hàm

//...existing looping and
//error-checking code from above...

//if this is a start tag token...
if tokenType == html.StartTagToken {
    //get the token
    token := tokenizer.Token()
    //if the name of the element is "title"
    if "title" == token.Data {
        //the next token should be the page title
        tokenType = tokenizer.Next()
        //just make sure it's actually a text token
        if tokenType == html.TextToken {
            //report the page title and break out of the loop
            fmt.Println(tokenizer.Token().Data)
            break
        }
    }
}