From 23f1d484df5687b722e3e4f4a74a9fd62ad526af Mon Sep 17 00:00:00 2001 From: Elara Musayelyan Date: Sun, 16 Jul 2023 23:23:47 -0700 Subject: [PATCH] Implement Callouts --- pcre.go | 135 ++++++++++++++++++++++++++++++++++++++++++++++++--- pcre_test.go | 55 +++++++++++++++++++++ 2 files changed, 183 insertions(+), 7 deletions(-) diff --git a/pcre.go b/pcre.go index 828b036..d9adc92 100644 --- a/pcre.go +++ b/pcre.go @@ -27,7 +27,11 @@ type Regexp struct { mtx *sync.Mutex expr string re uintptr + mctx uintptr tls *libc.TLS + + calloutMtx *sync.Mutex + callout *func(tls *libc.TLS, cbptr, data uintptr) int32 } // Compile runs CompileOpts with no options. @@ -73,10 +77,12 @@ func CompileOpts(pattern string, options CompileOption) (*Regexp, error) { // Create regexp instance regex := Regexp{ - expr: pattern, - mtx: &sync.Mutex{}, - re: r, - tls: tls, + expr: pattern, + mtx: &sync.Mutex{}, + re: r, + mctx: lib.Xpcre2_match_context_create_8(tls, 0), + tls: tls, + calloutMtx: &sync.Mutex{}, } // Make sure resources are freed if GC collects the @@ -298,7 +304,7 @@ func (r *Regexp) FindStringIndex(s string) []int { // FinAllString is the String version of FindAll func (r *Regexp) FindAllString(s string, n int) []string { matches := r.FindAll([]byte(s), n) - + out := make([]string, len(matches)) for index, match := range matches { out[index] = string(match) @@ -483,9 +489,12 @@ func (r *Regexp) ReplaceAllLiteralString(src, repl string) string { // between those expression matches. // // Example: +// // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) // // s: ["", "b", "b", "c", "cadaaae"] +// // The count determines the number of substrings to return: +// // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
// n == 0: the result is nil (zero substrings) // n < 0: all substrings @@ -556,6 +565,116 @@ func (r *Regexp) SubexpIndex(name string) int { return int(ret) } +type CalloutFlags uint32 + +const ( + CalloutStartMatch = CalloutFlags(lib.DPCRE2_CALLOUT_STARTMATCH) + CalloutBacktrack = CalloutFlags(lib.DPCRE2_CALLOUT_BACKTRACK) +) + +type CalloutBlock struct { + // Version contains the version number of the block format. + // The current version is 2. + Version uint32 + + // CalloutNumber contains the number of the callout, in the range 0-255. + // This is the number that follows "?C". For callouts with string arguments, + // this will always be zero. + CalloutNumber uint32 + + // CaptureTop contains the number of the highest numbered substring + // captured so far plus one. If no substrings have yet been captured, + // CaptureTop will be set to 1. + CaptureTop uint32 + + // CaptureLast contains the number of the last substring that was captured. + CaptureLast uint32 + + // Substrings contains all of the substrings captured so far. + Substrings []string + + Mark string + + // Subject contains the string passed to the match function. + Subject string + + // StartMatch contains the offset within the subject at which the current match attempt started. + StartMatch uint + + // CurrentPosition contains the offset of the current match pointer within the subject. + CurrentPosition uint + + // PatternPosition contains the offset within the pattern string to the next item to be matched. + PatternPosition uint + + // NextItemLength contains the length of the next item to be processed in the pattern string. + NextItemLength uint + + // CalloutStringOffset contains the code unit offset to the start of the callout argument string within the original pattern string. + CalloutStringOffset uint + + // CalloutString is the string for the callout. For numerical callouts, this will always be empty. 
+ CalloutString string + + // CalloutFlags contains the following flags: + // CalloutStartMatch + // This is set for the first callout after the start of matching for each new starting position in the subject. + // CalloutBacktrack + // This is set if there has been a matching backtrack since the previous callout, or since the start of matching if this is the first callout from a pcre2_match() run. + // + // Both bits are set when a backtrack has caused a "bumpalong" to a new starting position in the subject. + CalloutFlags CalloutFlags +} + +func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error { + cfn := func(tls *libc.TLS, cbptr, data uintptr) int32 { + ccb := (*lib.Tpcre2_callout_block_8)(unsafe.Pointer(cbptr)) + + cb := &CalloutBlock{ + Version: ccb.Fversion, + CalloutNumber: ccb.Fcallout_number, + CaptureTop: ccb.Fcapture_top, + CaptureLast: ccb.Fcapture_last, + Mark: libc.GoString(ccb.Fmark), + StartMatch: uint(ccb.Fstart_match), + CurrentPosition: uint(ccb.Fcurrent_position), + PatternPosition: uint(ccb.Fpattern_position), + NextItemLength: uint(ccb.Fnext_item_length), + CalloutStringOffset: uint(ccb.Fcallout_string_offset), + CalloutFlags: CalloutFlags(ccb.Fcallout_flags), + } + + subjectBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fsubject)), ccb.Fsubject_length) + cb.Subject = string(subjectBytes) + + calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length) + cb.CalloutString = string(calloutStrBytes) + + ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), ccb.Fcapture_top*2)[2:] // pairs 1..capture_top-1; pair 0 is not set during callouts + for i := 0; i+1 < len(ovecSlice); i += 2 { + if start, end := ovecSlice[i], ovecSlice[i+1]; start > end || end > ccb.Fsubject_length { + cb.Substrings = append(cb.Substrings, "") // group is unset (PCRE2_UNSET offsets) at this point of the match + } else { + cb.Substrings = append(cb.Substrings, cb.Subject[start:end]) + } + } + + x := fn(cb) + return x + } + + // Prevent callout functions from being GC'd + r.calloutMtx.Lock() + defer r.calloutMtx.Unlock() +
r.callout = &cfn + + ret := lib.Xpcre2_set_callout_8(r.tls, r.mctx, *(*uintptr)(unsafe.Pointer(&cfn)), 0) + if ret < 0 { + return codeToError(r.tls, ret) + } + return nil +} + // replaceBytes replaces the bytes at a given location, and returns a new // offset, based on how much bigger or smaller the slice got after replacement func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) { @@ -577,7 +696,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e if len(b) == 0 { return nil, nil } - + r.mtx.Lock() defer r.mtx.Unlock() @@ -600,7 +719,7 @@ func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, e // While the offset is less than the length of the subject for offset < cSubjectLen { // Execute expression on subject - ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, 0) + ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, r.mctx) if ret < 0 { // If no match found, break if ret == lib.DPCRE2_ERROR_NOMATCH { @@ -670,6 +789,8 @@ func (r *Regexp) Close() error { // Free the compiled code lib.Xpcre2_code_free_8(r.tls, r.re) + // Free the match context + lib.Xpcre2_match_context_free_8(r.tls, r.mctx) // Set regular expression to null r.re = 0 diff --git a/pcre_test.go b/pcre_test.go index 008b32d..f8a9b1c 100644 --- a/pcre_test.go +++ b/pcre_test.go @@ -234,3 +234,58 @@ func TestString(t *testing.T) { t.Errorf("expected %s, got %s", expr, r.String()) } } + +func TestCallout(t *testing.T) { + const expr = `(https?)://([.\w\d]+\.[\w\d]{2,4}[\w\d?&=%/.-]*)(?C2)` + subject := "https://www.elara.ws/" + + r := pcre.MustCompile(expr) + defer r.Close() + + executed := false + r.SetCallout(func(cb *pcre.CalloutBlock) int32 { + executed = true + + if cb.CalloutNumber != 2 { + t.Errorf("[CalloutNumber] expected %d, got %d", 2, cb.CalloutNumber) + } + + if cb.CaptureTop != 3 { + t.Errorf("[CaptureTop] expected %d, got %d", 3, 
cb.CaptureTop) + } + + if cb.CaptureLast != 2 { + t.Errorf("[CaptureLast] expected %d, got %d", 2, cb.CaptureLast) + } + + if cb.Subject != subject { + t.Errorf("[Subject] expected %q, got %q", subject, cb.Subject) + } + + if cb.StartMatch != 0 { + t.Errorf("[StartMatch] expected %d, got %d", 0, cb.StartMatch) + } + + if cb.CurrentPosition != 21 { + t.Errorf("[CurrentPosition] expected %d, got %d", 21, cb.CurrentPosition) + } + + if cb.PatternPosition != 53 { + t.Errorf("[PatternPosition] expected %d, got %d", 53, cb.PatternPosition) + } + + if cb.NextItemLength != 0 { + t.Errorf("[NextItemLength] expected %d, got %d", 0, cb.NextItemLength) + } + + return 0 + }) + + m := r.MatchString(subject) + + if !executed { + t.Error("expected callout to be executed") + } else if !m { + t.Error("expected regular expression to match the string") + } +}