From 70c76493a8f61395f0ec1cd153044e143fc84a8a Mon Sep 17 00:00:00 2001 From: Tom <0x546f6d@protonmail.com> Date: Mon, 23 Dec 2024 14:54:26 +0300 Subject: [PATCH 1/2] implemented simple discard delimiters inside double quote strings --- src/iterators.zig | 86 ++++++++++++++++++++++++++++++++++++++++------- src/zig-csv.zig | 34 ++++++++++++++----- 2 files changed, 98 insertions(+), 22 deletions(-) diff --git a/src/iterators.zig b/src/iterators.zig index 8160bcb..0b7c481 100644 --- a/src/iterators.zig +++ b/src/iterators.zig @@ -2,6 +2,7 @@ //! [Released under GNU LGPLv3] const std = @import("std"); const TableError = @import("zig-csv.zig").TableError; +const Allocator = std.mem.Allocator; /// A struct for iterating over or fetching rows from a parsed table pub const TableIterator = struct { @@ -10,6 +11,8 @@ pub const TableIterator = struct { delimiter: []const u8, header: []const []const u8, body: []const []const u8, + allocator: Allocator, + check_quote: bool, /// Reset the iterator for the function TableIterator.next pub fn reset(self: *TableIterator) void { @@ -23,6 +26,8 @@ pub const TableIterator = struct { const row = RowIterator{ .header = self.header, .row = std.mem.splitSequence(u8, self.body[self.iterator_index], self.delimiter), + .allocator = self.allocator, + .check_quote = self.check_quote, }; self.iterator_index += 1; @@ -57,6 +62,8 @@ pub const RowIterator = struct { iterator_index: usize = 0, header: []const []const u8, row: std.mem.SplitIterator(u8, .sequence), + allocator: Allocator, + check_quote: bool, /// Reset the iterator for the function RowIterator.next pub fn reset(self: *RowIterator) void { @@ -69,12 +76,18 @@ pub const RowIterator = struct { const value = self.row.next(); if (value == null) return null; - const item = RowItem{ + var item = RowItem{ .column_index = self.iterator_index, .key = self.header[self.iterator_index], .value = value.?, }; + if (self.check_quote and item.value.len > 0 and item.value[0] == '"' and item.value[item.value.len - 1] != '"') { + while (item.value[item.value.len - 1] != '"') { + item.value = std.mem.concat(self.allocator, u8, &[_][]const u8{ item.value, self.row.delimiter, self.row.next().? }) catch item.value; + } + } + self.iterator_index += 1; return item; @@ -85,16 +98,23 @@ pub const RowIterator = struct { var iterator = std.mem.splitSequence(u8, self.row.buffer, self.row.delimiter); var current_column_index: usize = 0; - while (iterator.next()) |value| : (current_column_index += 1) { - if (current_column_index == target_column_index) { - return RowItem{ - .column_index = current_column_index, - .key = self.header[current_column_index], - .value = value, - }; + if (self.check_quote) { + return RowItem{ + .column_index = current_column_index, + .key = self.header[current_column_index], + .value = try getColumnItemInQuote(u8, &iterator, target_column_index, self.allocator), + }; + } else { + while (iterator.next()) |value| : (current_column_index += 1) { + if (current_column_index == target_column_index) { + return RowItem{ + .column_index = current_column_index, + .key = self.header[current_column_index], + .value = value, + }; + } } } - return TableError.IndexNotFound; } }; @@ -114,20 +134,32 @@ pub const ColumnIterator = struct { column_index: usize, delimiter: []const u8, body: []const []const u8, + allocator: Allocator, + check_quote: bool, // Create a ColumnItem from a row fn rowToColumnItem(self: ColumnIterator, row: []const u8) ColumnItem { var item: ColumnItem = undefined; var values = std.mem.splitSequence(u8, row, self.delimiter); - var current_index: usize = 0; - while (values.next()) |value| : (current_index += 1) { - if (current_index == self.column_index) { + if (self.check_quote) { + const value: ?[]const u8 = getColumnItemInQuote(u8, &values, self.column_index, self.allocator) catch null; + if (value != null) { item = ColumnItem{ .row_index = self.iterator_index, - .value = value, + .value = value.?, }; } + } else { + var current_index: usize = 0; + while (values.next()) |value| : (current_index += 1) { + if (current_index == self.column_index) { + item = ColumnItem{ + .row_index = self.iterator_index, + .value = value, + }; + } + } } return item; @@ -160,3 +192,31 @@ pub const ColumnIterator = struct { return item; } }; + +/// Return the value of a column in a row, while discarding delimiters inside "double quotes" +pub fn getColumnItemInQuote(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize, allocator: Allocator) TableError![]const T { + var index: usize = 0; + var tmp_val: ?[]const u8 = null; + + while (split_iterator.next()) |*item| { + var item_value = item.*; + if (item_value.len > 0) { + if (tmp_val == null and item_value[0] == '"' and item_value[item_value.len - 1] != '"') { + tmp_val = item_value; + } else if (tmp_val != null) { + tmp_val = std.mem.concat(allocator, u8, &[_][]const u8{ tmp_val.?, split_iterator.delimiter, item_value }) catch tmp_val; + if (item_value[item_value.len - 1] == '"') { + item_value = tmp_val.?; + tmp_val = null; + } + } + } + if (tmp_val == null) { + if (index == target_index) { + return item_value; + } + index += 1; + } + } + return TableError.IndexNotFound; +} diff --git a/src/zig-csv.zig b/src/zig-csv.zig index b2efe36..9458f1e 100644 --- a/src/zig-csv.zig +++ b/src/zig-csv.zig @@ -7,6 +7,7 @@ const ArrayList = std.ArrayList; const TableIterator = @import("iterators.zig").TableIterator; const RowIterator = @import("iterators.zig").RowIterator; const ColumnIterator = @import("iterators.zig").ColumnIterator; +const getColumnItemInQuote = @import("iterators.zig").getColumnItemInQuote; /// A structure for storing settings for use with struct Table pub const Settings = struct { @@ -14,6 +15,8 @@ pub const Settings = struct { delimiter: []const u8, /// The terminator that defines when a row of delimiter-separated values is terminated terminator: []const u8, + /// The check_quote discards delimiters inside "double quotes" when separating values + check_quote: bool, /// A function that returns the default settings that are most commonly used for CSV data /// { .delimiter = ",", .terminator = "\n" } @@ -21,6 +24,7 @@ pub const Settings = struct { return Settings{ .delimiter = ",", .terminator = "\n", + .check_quote = false, }; } }; @@ -59,16 +63,22 @@ pub const Table = struct { body: std.ArrayListAligned([]const u8, null), // Return the item with the matching index from an iterator struct std.mem.SplitIterator(T) - fn splitIteratorGetIndex(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T { - var index: usize = 0; + fn splitIteratorGetIndex(self: Table, comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T { + if (self.check_quote) { + return getColumnItemInQuote(u8, split_iterator, target_index, self.allocator); + } else { + var index: usize = 0; - while (split_iterator.next()) |item| : (index += 1) { - if (index == target_index) { - return item; - } - } + if (!self.settings.check_quote) { + while (split_iterator.next()) |item| : (index += 1) { + if (index == target_index) { + return item; + } + } + } else {} - return TableError.IndexNotFound; + return TableError.IndexNotFound; + } } /// Initialize struct Table @@ -125,6 +135,8 @@ pub const Table = struct { .delimiter = self.settings.delimiter, .header = self.header.items, .body = self.body.items, + .allocator = self.allocator, + .check_quote = self.settings.check_quote, }; } @@ -153,7 +165,7 @@ pub const Table = struct { const row_count = std.mem.count(u8, row, self.settings.delimiter) + 1; var row_values = std.mem.splitSequence(u8, row, self.settings.delimiter); if (column_index >= row_count) return TableError.MissingValue; - const value = try Table.splitIteratorGetIndex(u8, &row_values, column_index); + const value = try self.splitIteratorGetIndex(u8, &row_values, column_index); if (std.mem.eql(u8, value, searched_value)) { try row_indexes.append(row_index); @@ -171,6 +183,8 @@ pub const Table = struct { .body = self.body.items, .delimiter = self.settings.delimiter, .column_index = column_index, + .allocator = self.allocator, + .check_quote = self.settings.check_quote, }; } @@ -181,6 +195,8 @@ pub const Table = struct { return RowIterator{ .header = self.header.items, .row = std.mem.splitSequence(u8, self.body.items[row_index], self.settings.delimiter), + .allocator = self.allocator, + .check_quote = self.settings.check_quote, }; } From 1a42dadc954379032ce651da00ad00ea813d395f Mon Sep 17 00:00:00 2001 From: Tom <0x546f6d@protonmail.com> Date: Tue, 28 Jan 2025 09:53:41 +0300 Subject: [PATCH 2/2] fixed memory leak using table.arena; only allocate if value inside double-quote needs to be returned --- src/iterators.zig | 45 ++++++++++++++++++++++++--------------------- src/zig-csv.zig | 21 ++++++++++----------- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/iterators.zig b/src/iterators.zig index 0b7c481..abcda9c 100644 --- a/src/iterators.zig +++ b/src/iterators.zig @@ -42,6 +42,8 @@ pub const TableIterator = struct { return RowIterator{ .header = self.header, .row = std.mem.splitSequence(u8, self.body[row_index], self.delimiter), + .allocator = self.allocator, + .check_quote = self.check_quote, }; } }; @@ -100,8 +102,8 @@ pub const RowIterator = struct { if (self.check_quote) { return RowItem{ - .column_index = current_column_index, - .key = self.header[current_column_index], + .column_index = target_column_index, + .key = self.header[target_column_index], .value = try getColumnItemInQuote(u8, &iterator, target_column_index, self.allocator), }; } else { @@ -194,29 +196,30 @@ pub const ColumnIterator = struct { }; /// Return the value of a column in a row, while discarding delimiters inside "double quotes" -pub fn getColumnItemInQuote(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize, allocator: Allocator) TableError![]const T { +pub fn getColumnItemInQuote(comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize, allocator: std.mem.Allocator) TableError![]const T { var index: usize = 0; - var tmp_val: ?[]const u8 = null; - - while (split_iterator.next()) |*item| { - var item_value = item.*; - if (item_value.len > 0) { - if (tmp_val == null and item_value[0] == '"' and item_value[item_value.len - 1] != '"') { - tmp_val = item_value; - } else if (tmp_val != null) { - tmp_val = std.mem.concat(allocator, u8, &[_][]const u8{ tmp_val.?, split_iterator.delimiter, item_value }) catch tmp_val; - if (item_value[item_value.len - 1] == '"') { - item_value = tmp_val.?; - tmp_val = null; - } - } - } - if (tmp_val == null) { + var in_quote = false; + var item_in_quote: []const u8 = ""; + + while (split_iterator.next()) |item| { + if (!in_quote and item.len > 1 and item[0] == '"' and item[item.len - 1] != '"') { // check if item is the beginning of a double quoted value + in_quote = true; + if (index == target_index) item_in_quote = item; + continue; + } else if (in_quote) { // process item inside double quote + // allocate if item needs to be returned if (index == target_index) { - return item_value; + item_in_quote = try std.mem.concat(allocator, u8, &[_][]const u8{ item_in_quote, split_iterator.delimiter, item }); } - index += 1; + if (item.len == 0 or item[item.len - 1] != '"') continue; + // item is the end of the double quoted value + in_quote = false; } + + // return item value + if (item_in_quote.len > 0) return item_in_quote else if (index == target_index) return item; + index += 1; } + return TableError.IndexNotFound; } diff --git a/src/zig-csv.zig b/src/zig-csv.zig index 9458f1e..9332af4 100644 --- a/src/zig-csv.zig +++ b/src/zig-csv.zig @@ -16,7 +16,7 @@ pub const Settings = struct { /// The terminator that defines when a row of delimiter-separated values is terminated terminator: []const u8, /// The check_quote discards delimiters inside "double quotes" when separating values - check_quote: bool, + check_quote: bool = false, /// A function that returns the default settings that are most commonly used for CSV data /// { .delimiter = ",", .terminator = "\n" } @@ -24,7 +24,6 @@ pub const Settings = struct { return Settings{ .delimiter = ",", .terminator = "\n", - .check_quote = false, }; } }; @@ -63,9 +62,9 @@ pub const Table = struct { body: std.ArrayListAligned([]const u8, null), // Return the item with the matching index from an iterator struct std.mem.SplitIterator(T) - fn splitIteratorGetIndex(self: Table, comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T { - if (self.check_quote) { - return getColumnItemInQuote(u8, split_iterator, target_index, self.allocator); + fn splitIteratorGetIndex(self: *Table, comptime T: type, split_iterator: *std.mem.SplitIterator(T, .sequence), target_index: usize) TableError![]const T { + if (self.settings.check_quote) { + return getColumnItemInQuote(u8, split_iterator, target_index, self.arena_allocator.allocator()); } else { var index: usize = 0; @@ -130,12 +129,12 @@ pub const Table = struct { } /// Returns a struct TableIterator containing all rows inside struct Table - pub fn getAllRows(self: Table) TableIterator { + pub fn getAllRows(self: *Table) TableIterator { return TableIterator{ .delimiter = self.settings.delimiter, .header = self.header.items, .body = self.body.items, - .allocator = self.allocator, + .allocator = self.arena_allocator.allocator(), .check_quote = self.settings.check_quote, }; } @@ -156,7 +155,7 @@ pub const Table = struct { } /// Return a slice of row indexes by a provided column index and searched value - pub fn findRowIndexesByValue(self: Table, allocator: Allocator, column_index: usize, searched_value: []const u8) TableError![]usize { + pub fn findRowIndexesByValue(self: *Table, allocator: Allocator, column_index: usize, searched_value: []const u8) TableError![]usize { var row_indexes = ArrayList(usize).init(allocator); if (column_index >= self.header.items.len) return TableError.IndexNotFound; @@ -178,12 +177,12 @@ pub const Table = struct { } /// Returns a struct ColumnIterator, containing all elements of a given column by its index - pub fn getColumnByIndex(self: Table, column_index: usize) ColumnIterator { + pub fn getColumnByIndex(self: *Table, column_index: usize) ColumnIterator { return ColumnIterator{ .body = self.body.items, .delimiter = self.settings.delimiter, .column_index = column_index, - .allocator = self.allocator, + .allocator = self.arena_allocator.allocator(), .check_quote = self.settings.check_quote, }; } @@ -195,7 +194,7 @@ pub const Table = struct { return RowIterator{ .header = self.header.items, .row = std.mem.splitSequence(u8, self.body.items[row_index], self.settings.delimiter), - .allocator = self.allocator, + .allocator = self.arena_allocator.allocator(), .check_quote = self.settings.check_quote, }; }