diff --git a/.github/workflows/BundleStaticLibs.yml b/.github/workflows/BundleStaticLibs.yml index e9e4e6690d3b..3fbe60961d93 100644 --- a/.github/workflows/BundleStaticLibs.yml +++ b/.github/workflows/BundleStaticLibs.yml @@ -80,7 +80,8 @@ jobs: - name: Bundle static library shell: bash - run: make bundle-library-o + run: | + make gather-libs - name: Print platform shell: bash @@ -93,14 +94,14 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DUCKDB_STAGING_KEY }} run: | python3 scripts/amalgamation.py - zip -j static-lib-osx-${{ matrix.architecture }}.zip src/include/duckdb.h build/release/libduckdb_bundle.a - ./scripts/upload-assets-to-staging.sh github_release static-lib-osx-${{ matrix.architecture }}.zip + zip -r -j static-libs-osx-${{ matrix.architecture }}.zip src/include/duckdb.h build/release/libs/ + ./scripts/upload-assets-to-staging.sh github_release static-libs-osx-${{ matrix.architecture }}.zip - uses: actions/upload-artifact@v4 with: - name: duckdb-static-lib-osx-${{ matrix.architecture }} + name: duckdb-static-libs-osx-${{ matrix.architecture }} path: | - static-lib-osx-${{ matrix.architecture }}.zip + static-libs-osx-${{ matrix.architecture }}.zip bundle-mingw-static-lib: name: Windows MingW static libs @@ -142,7 +143,7 @@ jobs: - name: Bundle static library shell: bash run: | - make bundle-library-obj + make gather-libs - name: Deploy shell: bash @@ -150,14 +151,14 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.S3_DUCKDB_STAGING_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DUCKDB_STAGING_KEY }} run: | - zip -j static-lib-windows-mingw.zip src/include/duckdb.h build/release/libduckdb_bundle.a - ./scripts/upload-assets-to-staging.sh github_release static-lib-windows-mingw.zip + zip -r -j static-libs-windows-mingw.zip src/include/duckdb.h build/release/libs/ + ./scripts/upload-assets-to-staging.sh github_release static-libs-windows-mingw.zip - uses: actions/upload-artifact@v4 with: - name: duckdb-static-lib-windows-mingw + name: duckdb-static-libs-windows-mingw path: | - static-lib-windows-mingw.zip + static-libs-windows-mingw.zip bundle-linux-arm64-static-libs: name: Linux arm64 static libs runs-on: ubuntu-latest @@ -183,7 +184,7 @@ jobs: -e FORCE_WARN_UNUSED=1 \ -e DUCKDB_PLATFORM=linux_arm64 \ ubuntu:18.04 \ - bash -c "/duckdb/scripts/setup_ubuntu1804.sh && git config --global --add safe.directory /duckdb && make bundle-library -C /duckdb" + bash -c "/duckdb/scripts/setup_ubuntu1804.sh && git config --global --add safe.directory /duckdb && make gather-libs -C /duckdb" - name: Deploy shell: bash env: @@ -191,13 +192,13 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DUCKDB_STAGING_KEY }} run: | python3 scripts/amalgamation.py - zip -j static-lib-linux-arm64.zip src/include/duckdb.h build/release/libduckdb_bundle.a - ./scripts/upload-assets-to-staging.sh github_release static-lib-linux-arm64.zip + zip -r -j static-libs-linux-arm64.zip src/include/duckdb.h build/release/libs/ + ./scripts/upload-assets-to-staging.sh github_release static-libs-linux-arm64.zip - uses: actions/upload-artifact@v4 with: - name: duckdb-static-lib-linux-arm64 + name: duckdb-static-libs-linux-arm64 path: | - static-lib-linux-arm64.zip + static-libs-linux-arm64.zip bundle-linux-amd64-static-libs: name: Linux amd64 static libs runs-on: ubuntu-latest @@ -225,7 +226,7 @@ jobs: -e BUILD_BENCHMARK=1 \ -e FORCE_WARN_UNUSED=1 \ quay.io/pypa/manylinux2014_x86_64 \ - bash -c "yum install -y perl-IPC-Cmd && git config --global --add safe.directory $PWD && make bundle-library -C $PWD" + bash -c "yum install -y perl-IPC-Cmd && git config --global --add safe.directory $PWD && make gather-libs -C $PWD" - name: Print platform shell: bash run: ./build/release/duckdb -c "PRAGMA platform;" @@ -237,10 +238,10 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DUCKDB_STAGING_KEY }} run: | python3 scripts/amalgamation.py - zip -j static-lib-linux-amd64.zip src/include/duckdb.h build/release/libduckdb_bundle.a - ./scripts/upload-assets-to-staging.sh github_release static-lib-linux-amd64.zip + zip -r -j static-libs-linux-amd64.zip src/include/duckdb.h build/release/libs/ + ./scripts/upload-assets-to-staging.sh github_release static-libs-linux-amd64.zip - uses: actions/upload-artifact@v4 with: - name: duckdb-static-lib-linux-amd64 + name: duckdb-static-libs-linux-amd64 path: | - static-lib-linux-amd64.zip \ No newline at end of file + static-libs-linux-amd64.zip \ No newline at end of file diff --git a/Makefile b/Makefile index 38ee1c99dab5..b17c7e6ff66b 100644 --- a/Makefile +++ b/Makefile @@ -524,3 +524,11 @@ bundle-library-obj: bundle-setup bundle-library: release make bundle-library-o + +gather-libs: release + cd build/release && \ + rm -rf libs && \ + mkdir -p libs && \ + cp src/libduckdb_static.a libs/. && \ + cp third_party/*/libduckdb_*.a libs/. && \ + cp extension/*/lib*_extension.a libs/. \ No newline at end of file diff --git a/data/csv/afl/4086/case_1.csv b/data/csv/afl/4086/case_1.csv new file mode 100644 index 000000000000..0b404fbd4d54 --- /dev/null +++ b/data/csv/afl/4086/case_1.csv @@ -0,0 +1,2 @@ +Ë +Ì \ No newline at end of file diff --git a/data/csv/afl/4086/case_2.csv b/data/csv/afl/4086/case_2.csv new file mode 100644 index 000000000000..c9bf78e836f7 --- /dev/null +++ b/data/csv/afl/4086/case_2.csv @@ -0,0 +1,149 @@ + +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +j +k +l + +Ë +Ì + +F +G +H +I +J +K +L +M +N +O +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ôw +x +y +z +{ +| +} +~ +Ÿ +¡ +¢ +£ +¤ +¥ +¦ +§ +¨ +© +ª +« +¬ +­ +® +¯ +° +± +² +³ +´ +µ +¶ +· +¸ +¹ +º +» +¼ +½ +¾ +¿ +À +Á +Â +Ã +Ä +Å +ÿ}}}}}}}}}}}}}}}}}}}}}} +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ô +2 +3 +4 +5 +Õ +Ö +× +Ø +Ù +Ú +Û +Ü +Ý +Þ +ß +à +á +â +ã +ä +å +æ +ç +è +é \ No newline at end of file diff --git a/data/csv/afl/4086/case_3.csv b/data/csv/afl/4086/case_3.csv new file mode 100644 index 000000000000..501156bee996 Binary files /dev/null and b/data/csv/afl/4086/case_3.csv differ diff --git a/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp b/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp index a330c0a4bbef..b6a088786b7e 100644 --- a/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp +++ b/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp @@ -8,17 +8,13 @@ #pragma once -#include "duckdb/common/sort/sort.hpp" #include "duckdb/common/types/column/column_data_collection.hpp" -#include "duckdb/common/types/row/row_layout.hpp" #include "core_functions/aggregate/quantile_helpers.hpp" -#include "duckdb/execution/merge_sort_tree.hpp" #include "duckdb/common/operator/cast_operators.hpp" #include "duckdb/common/operator/multiply.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" #include "duckdb/function/window/window_index_tree.hpp" #include -#include #include #include @@ -89,7 +85,7 @@ struct QuantileDirect { using RESULT_TYPE = T; inline const INPUT_TYPE &operator()(const INPUT_TYPE &x) const { - return x; + return x; // NOLINT } }; @@ -365,7 +361,7 @@ struct QuantileSortTree { } inline idx_t SelectNth(const SubFrames &frames, size_t n) const { - return index_tree->SelectNth(frames, n); + return index_tree->SelectNth(frames, n).first; } template diff --git a/extension/core_functions/include/core_functions/aggregate/quantile_state.hpp b/extension/core_functions/include/core_functions/aggregate/quantile_state.hpp index 00f4baf77735..cdf242ae9c9c 100644 --- a/extension/core_functions/include/core_functions/aggregate/quantile_state.hpp +++ b/extension/core_functions/include/core_functions/aggregate/quantile_state.hpp @@ -207,6 +207,9 @@ struct WindowQuantileState { dest[0] = skips[0].second; if (skips.size() > 1) { dest[1] = skips[1].second; + } else { + // Avoid UMA + dest[1] = skips[0].second; } return interp.template Extract(dest.data(), result); } catch (const duckdb_skiplistlib::skip_list::IndexError &idx_err) { diff --git a/scripts/generate_extensions_function.py b/scripts/generate_extensions_function.py index da2181121abe..d6e861bfc6e9 100644 --- a/scripts/generate_extensions_function.py +++ b/scripts/generate_extensions_function.py @@ -742,6 +742,7 @@ def write_header(data: ExtensionData): }; // EXTENSION_SECRET_PROVIDERS static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = { + "avro", "aws", "azure", "autocomplete", diff --git a/src/common/types/row/tuple_data_segment.cpp b/src/common/types/row/tuple_data_segment.cpp index d14c0e0bad4c..c3383f3cb26f 100644 --- a/src/common/types/row/tuple_data_segment.cpp +++ b/src/common/types/row/tuple_data_segment.cpp @@ -15,23 +15,23 @@ void TupleDataChunkPart::SetHeapEmpty() { base_heap_ptr = nullptr; } -void SwapTupleDataChunkPart(TupleDataChunkPart &a, TupleDataChunkPart &b) { - std::swap(a.row_block_index, b.row_block_index); - std::swap(a.row_block_offset, b.row_block_offset); - std::swap(a.heap_block_index, b.heap_block_index); - std::swap(a.heap_block_offset, b.heap_block_offset); - std::swap(a.base_heap_ptr, b.base_heap_ptr); - std::swap(a.total_heap_size, b.total_heap_size); - std::swap(a.count, b.count); +void MoveTupleDataChunkPart(TupleDataChunkPart &a, TupleDataChunkPart &b) { + a.row_block_index = b.row_block_index; + a.row_block_offset = b.row_block_offset; + a.heap_block_index = b.heap_block_index; + a.heap_block_offset = b.heap_block_offset; + a.base_heap_ptr = b.base_heap_ptr; + a.total_heap_size = b.total_heap_size; + a.count = b.count; std::swap(a.lock, b.lock); } TupleDataChunkPart::TupleDataChunkPart(TupleDataChunkPart &&other) noexcept : lock((other.lock)) { - SwapTupleDataChunkPart(*this, other); + MoveTupleDataChunkPart(*this, other); } TupleDataChunkPart &TupleDataChunkPart::operator=(TupleDataChunkPart &&other) noexcept { - SwapTupleDataChunkPart(*this, other); + MoveTupleDataChunkPart(*this, other); return *this; } diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 476e4f4bb468..386c5552cf76 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -63,9 +63,9 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m "Mismatch between the number of columns (%d) in the CSV file and what is expected in the scanner (%d).", number_of_columns, csv_file_scan->file_types.size()); } - bool icu_loaded = csv_file_scan->buffer_manager->context.db->ExtensionIsLoaded("icu"); + icu_loaded = csv_file_scan->buffer_manager->context.db->ExtensionIsLoaded("icu"); for (idx_t i = 0; i < csv_file_scan->file_types.size(); i++) { - auto &type = csv_file_scan->file_types[i]; + auto type = csv_file_scan->file_types[i]; if (type.IsJSONType()) { type = LogicalType::VARCHAR; } @@ -436,6 +436,8 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size } // If we got here, we are ignoring errors, hence we must ignore this line. current_errors.Insert(INVALID_UNICODE, cur_col_id, chunk_col_id, last_position); + static_cast(vector_ptr[chunk_col_id])[number_of_rows] = StringVector::AddStringOrBlob( + parse_chunk.data[chunk_col_id], string_t(value_ptr, UnsafeNumericCast(0))); break; } if (allocate) { @@ -606,7 +608,7 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_position) { - bool first_nl; + bool first_nl = false; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles, PrintErrorLine()); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); if (current_line_position.begin == error_position) { @@ -673,6 +675,9 @@ bool LineError::HandleErrors(StringValueResult &result) { result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl), line_pos.GetGlobalPosition(result.requested_size), result.path); } + if (!StringValueScanner::CanDirectlyCast(result.csv_file_scan->file_types[col_idx], result.icu_loaded)) { + result.number_of_rows--; + } break; } case UNTERMINATED_QUOTES: @@ -690,7 +695,7 @@ bool LineError::HandleErrors(StringValueResult &result) { break; case CAST_ERROR: { string column_name; - LogicalTypeId type_id; + LogicalTypeId type_id = LogicalTypeId::INVALID; if (cur_error.col_idx < result.names.size()) { column_name = result.names[cur_error.col_idx]; } @@ -768,7 +773,7 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() const { string FullLinePosition::ReconstructCurrentLine(bool &first_char_nl, unordered_map> &buffer_handles, bool reconstruct_line) const { - if (!reconstruct_line) { + if (!reconstruct_line || begin == end) { return {}; } string result; @@ -822,6 +827,8 @@ bool StringValueResult::AddRowInternal() { } if (current_errors.HandleErrors(*this)) { + D_ASSERT(buffer_handles.find(current_line_position.begin.buffer_idx) != buffer_handles.end()); + D_ASSERT(buffer_handles.find(current_line_position.end.buffer_idx) != buffer_handles.end()); line_positions_per_row[static_cast(number_of_rows)] = current_line_position; number_of_rows++; if (static_cast(number_of_rows) >= result_size) { @@ -881,6 +888,8 @@ bool StringValueResult::AddRowInternal() { RemoveLastLine(); } } + D_ASSERT(buffer_handles.find(current_line_position.begin.buffer_idx) != buffer_handles.end()); + D_ASSERT(buffer_handles.find(current_line_position.end.buffer_idx) != buffer_handles.end()); line_positions_per_row[static_cast(number_of_rows)] = current_line_position; cur_col_id = 0; chunk_col_id = 0; @@ -1024,6 +1033,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { auto &process_result = ParseChunk(); // First Get Parsed Chunk auto &parse_chunk = process_result.ToChunk(); + insert_chunk.Reset(); // We have to check if we got to error error_handler->ErrorIfNeeded(); if (parse_chunk.size() == 0) { @@ -1086,7 +1096,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { if (!state_machine->options.IgnoreErrors()) { LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); - bool first_nl; + bool first_nl = false; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine( first_nl, result.buffer_handles, result.PrintErrorLine()); std::ostringstream error; @@ -1094,11 +1104,15 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { << type.ToString() << "\'"; string error_msg = error.str(); SanitizeError(error_msg); + idx_t row_byte_pos = 0; + if (!(result.line_positions_per_row[line_error].begin == + result.line_positions_per_row[line_error].end)) { + row_byte_pos = result.line_positions_per_row[line_error].begin.GetGlobalPosition( + result.result_size, first_nl); + } auto csv_error = CSVError::CastError( state_machine->options, names[col_idx], error_msg, col_idx, borked_line, lines_per_batch, - result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, - first_nl), - optional_idx::Invalid(), result_vector.GetType().id(), result.path); + row_byte_pos, optional_idx::Invalid(), result_vector.GetType().id(), result.path); error_handler->Error(csv_error); } } @@ -1116,7 +1130,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { if (!state_machine->options.IgnoreErrors()) { LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); - bool first_nl; + bool first_nl = false; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine( first_nl, result.buffer_handles, result.PrintErrorLine()); std::ostringstream error; @@ -1148,6 +1162,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } // Now we slice the result insert_chunk.Slice(successful_rows, sel_idx); + result.borked_rows.clear(); } if (insert_chunk.size() == 0 && cur_buffer_handle) { idx_t to_pos; @@ -1697,13 +1712,14 @@ bool StringValueScanner::CanDirectlyCast(const LogicalType &type, bool icu_loade case LogicalTypeId::TIMESTAMP: case LogicalTypeId::TIME: case LogicalTypeId::DECIMAL: - case LogicalType::VARCHAR: case LogicalType::BOOLEAN: return true; case LogicalType::TIMESTAMP_TZ: // We only try to do direct cast of timestamp tz if the ICU extension is not loaded, otherwise, it needs to go // through string -> timestamp_tz casting return !icu_loaded; + case LogicalType::VARCHAR: + return !type.IsJSONType(); default: return false; } @@ -1884,7 +1900,6 @@ void StringValueScanner::FinalizeChunkProcess() { if (result.current_errors.HandleErrors(result)) { result.number_of_rows++; } - if (states.IsQuotedCurrent() && !found_error && state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) { // If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated diff --git a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp index 431b5ab6feca..332937d5bc55 100644 --- a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +++ b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp @@ -58,14 +58,19 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op } const auto delimiter_value = state_machine_options.delimiter.GetValue(); - const auto delimiter_first_byte = static_cast(delimiter_value[0]); + uint8_t delimiter_first_byte; + if (!delimiter_value.empty()) { + delimiter_first_byte = static_cast(delimiter_value[0]); + } else { + delimiter_first_byte = static_cast('\0'); + } const auto quote = static_cast(state_machine_options.quote.GetValue()); const auto escape = static_cast(state_machine_options.escape.GetValue()); const auto comment = static_cast(state_machine_options.comment.GetValue()); const auto new_line_id = state_machine_options.new_line.GetValue(); - const bool multi_byte_delimiter = delimiter_value.size() != 1; + const bool multi_byte_delimiter = delimiter_value.size() > 1; const bool enable_unquoted_escape = state_machine_options.strict_mode.GetValue() == false && state_machine_options.quote != state_machine_options.escape && @@ -149,7 +154,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op transition_array[static_cast(delimiter_value[1])] [static_cast(CSVState::DELIMITER_FIRST_BYTE)] = CSVState::DELIMITER; } else if (delimiter_value.size() == 3) { - if (delimiter_value[0] == delimiter_value[1]) { + if (delimiter_first_byte == delimiter_value[1]) { transition_array[static_cast(delimiter_value[1])] [static_cast(CSVState::DELIMITER_SECOND_BYTE)] = CSVState::DELIMITER_SECOND_BYTE; } @@ -158,11 +163,11 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op transition_array[static_cast(delimiter_value[2])] [static_cast(CSVState::DELIMITER_SECOND_BYTE)] = CSVState::DELIMITER; } else if (delimiter_value.size() == 4) { - if (delimiter_value[0] == delimiter_value[2]) { + if (delimiter_first_byte == delimiter_value[2]) { transition_array[static_cast(delimiter_value[1])] [static_cast(CSVState::DELIMITER_THIRD_BYTE)] = CSVState::DELIMITER_SECOND_BYTE; } - if (delimiter_value[0] == delimiter_value[1] && delimiter_value[1] == delimiter_value[2]) { + if (delimiter_first_byte == delimiter_value[1] && delimiter_value[1] == delimiter_value[2]) { transition_array[static_cast(delimiter_value[1])] [static_cast(CSVState::DELIMITER_THIRD_BYTE)] = CSVState::DELIMITER_THIRD_BYTE; } diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index f0d9a8cf735b..fbc1f8cd5e99 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -20,19 +20,32 @@ CSVErrorHandler::CSVErrorHandler(bool ignore_errors_p) : ignore_errors(ignore_er } void CSVErrorHandler::ThrowError(const CSVError &csv_error) { + auto error_to_throw = csv_error; + idx_t error_to_throw_row = GetLineInternal(error_to_throw.error_info); + if (PrintLineNumber(error_to_throw) && !errors.empty()) { + // We stored a previous error here, we pick the one that happens the earliest to throw + for (const auto &error : errors) { + if (CanGetLine(error.GetBoundaryIndex())) { + idx_t cur_error_to_throw = GetLineInternal(error.error_info); + if (cur_error_to_throw < error_to_throw_row) { + error_to_throw = error; + error_to_throw_row = cur_error_to_throw; + } + } + } + } std::ostringstream error; - if (PrintLineNumber(csv_error)) { - error << "CSV Error on Line: " << GetLineInternal(csv_error.error_info) << '\n'; - if (!csv_error.csv_row.empty()) { - error << "Original Line: " << csv_error.csv_row << '\n'; + if (PrintLineNumber(error_to_throw)) { + error << "CSV Error on Line: " << error_to_throw_row << '\n'; + if (!error_to_throw.csv_row.empty()) { + error << "Original Line: " << error_to_throw.csv_row << '\n'; } } - if (csv_error.full_error_message.empty()) { - error << csv_error.error_message; + if (error_to_throw.full_error_message.empty()) { + error << error_to_throw.error_message; } else { - error << csv_error.full_error_message; + error << error_to_throw.full_error_message; } - switch (csv_error.type) { case CAST_ERROR: throw ConversionException(error.str()); diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 8fe70d0ac1db..70fec93ab1ec 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -125,9 +125,6 @@ void CSVReaderOptions::SetDelimiter(const string &input) { if (delim_str.size() > 4) { throw InvalidInputException("The delimiter option cannot exceed a size of 4 bytes."); } - if (input.empty()) { - delim_str = string("\0", 1); - } this->dialect_options.state_machine_options.delimiter.Set(delim_str); } diff --git a/src/execution/operator/persistent/physical_export.cpp b/src/execution/operator/persistent/physical_export.cpp index 66afb4c97498..45e449aab64d 100644 --- a/src/execution/operator/persistent/physical_export.cpp +++ b/src/execution/operator/persistent/physical_export.cpp @@ -41,7 +41,7 @@ static void WriteCatalogEntries(stringstream &ss, catalog_entry_vector_t &entrie } catch (const NotImplementedException &) { ss << entry.get().ToSQL(); } - ss << '\n'; + ss << ";\n"; } ss << '\n'; } diff --git a/src/execution/physical_plan/plan_insert.cpp b/src/execution/physical_plan/plan_insert.cpp index c960fe1130aa..d4cbc48801b9 100644 --- a/src/execution/physical_plan/plan_insert.cpp +++ b/src/execution/physical_plan/plan_insert.cpp @@ -10,7 +10,7 @@ namespace duckdb { -static OrderPreservationType OrderPreservationRecursive(PhysicalOperator &op) { +OrderPreservationType PhysicalPlanGenerator::OrderPreservationRecursive(PhysicalOperator &op) { if (op.IsSource()) { return op.SourceOrder(); } diff --git a/src/function/aggregate/sorted_aggregate_function.cpp b/src/function/aggregate/sorted_aggregate_function.cpp index 88941c040b7d..5e3747cb462f 100644 --- a/src/function/aggregate/sorted_aggregate_function.cpp +++ b/src/function/aggregate/sorted_aggregate_function.cpp @@ -9,7 +9,6 @@ #include "duckdb/planner/expression/bound_aggregate_expression.hpp" #include "duckdb/planner/expression/bound_constant_expression.hpp" #include "duckdb/parser/expression_map.hpp" -#include "duckdb/function/aggregate/distributive_functions.hpp" namespace duckdb { @@ -770,6 +769,17 @@ void FunctionBinder::BindSortedAggregate(ClientContext &context, BoundAggregateE } void FunctionBinder::BindSortedAggregate(ClientContext &context, BoundWindowExpression &expr) { + // Make implicit orderings explicit + auto &aggregate = *expr.aggregate; + if (aggregate.order_dependent == AggregateOrderDependent::ORDER_DEPENDENT && expr.arg_orders.empty()) { + for (auto &order : expr.orders) { + const auto type = order.type; + const auto null_order = order.null_order; + auto expression = order.expression->Copy(); + expr.arg_orders.emplace_back(BoundOrderByNode(type, null_order, std::move(expression))); + } + } + if (expr.arg_orders.empty() || expr.children.empty()) { // not a sorted aggregate: return return; @@ -781,7 +791,6 @@ void FunctionBinder::BindSortedAggregate(ClientContext &context, BoundWindowExpr return; } } - auto &aggregate = *expr.aggregate; auto &children = expr.children; auto &arg_orders = expr.arg_orders; auto sorted_bind = make_uniq(context, expr); diff --git a/src/function/window/window_index_tree.cpp b/src/function/window/window_index_tree.cpp index 5791b2af747f..48fb5b1bef0a 100644 --- a/src/function/window/window_index_tree.cpp +++ b/src/function/window/window_index_tree.cpp @@ -1,6 +1,5 @@ #include "duckdb/function/window/window_index_tree.hpp" -#include #include namespace duckdb { @@ -52,11 +51,21 @@ void WindowIndexTreeLocalState::BuildLeaves() { } } -idx_t WindowIndexTree::SelectNth(const SubFrames &frames, idx_t n) const { +pair WindowIndexTree::SelectNth(const SubFrames &frames, idx_t n) const { if (mst32) { - return mst32->NthElement(mst32->SelectNth(frames, n)); + const auto nth = mst32->SelectNth(frames, n); + if (nth.second) { + return nth; + } else { + return {mst32->NthElement(nth.first), 0}; + } } else { - return mst64->NthElement(mst64->SelectNth(frames, n)); + const auto nth = mst64->SelectNth(frames, n); + if (nth.second) { + return nth; + } else { + return {mst64->NthElement(nth.first), 0}; + } } } diff --git a/src/function/window/window_value_function.cpp b/src/function/window/window_value_function.cpp index 6b8a7038ebb3..fbd1551a2e50 100644 --- a/src/function/window/window_value_function.cpp +++ b/src/function/window/window_value_function.cpp @@ -311,7 +311,12 @@ void WindowLeadLagExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate, const auto n = NumericCast(val_idx); const auto nth_index = glstate.value_tree->SelectNth(frames, n); // (4) evaluate the expression provided to LEAD/LAG on this row. - cursor.CopyCell(0, nth_index, result, i); + if (nth_index.second) { + // Overflow + FlatVector::SetNull(result, i, true); + } else { + cursor.CopyCell(0, nth_index.first, result, i); + } } else if (wexpr.default_expr) { leadlag_default.CopyCell(result, i); } else { @@ -425,7 +430,8 @@ void WindowFirstValueExecutor::EvaluateInternal(WindowExecutorGlobalState &gstat if (frame_width) { const auto first_idx = gvstate.value_tree->SelectNth(frames, 0); - cursor.CopyCell(0, first_idx, result, i); + D_ASSERT(first_idx.second == 0); + cursor.CopyCell(0, first_idx.first, result, i); } else { FlatVector::SetNull(result, i, true); } @@ -474,8 +480,19 @@ void WindowLastValueExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate } if (frame_width) { - const auto last_idx = gvstate.value_tree->SelectNth(frames, frame_width - 1); - cursor.CopyCell(0, last_idx, result, i); + auto n = frame_width - 1; + auto last_idx = gvstate.value_tree->SelectNth(frames, n); + if (last_idx.second && last_idx.second <= n) { + // Frame larger than data. Since we want last, we back off by the overflow + n -= last_idx.second; + last_idx = gvstate.value_tree->SelectNth(frames, n); + } + if (last_idx.second) { + // No last value - give up. + FlatVector::SetNull(result, i, true); + } else { + cursor.CopyCell(0, last_idx.first, result, i); + } } else { FlatVector::SetNull(result, i, true); } @@ -541,7 +558,12 @@ void WindowNthValueExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate, if (n < frame_width) { const auto nth_index = gvstate.value_tree->SelectNth(frames, n - 1); - cursor.CopyCell(0, nth_index, result, i); + if (nth_index.second) { + // Past end of frame + FlatVector::SetNull(result, i, true); + } else { + cursor.CopyCell(0, nth_index.first, result, i); + } } else { FlatVector::SetNull(result, i, true); } diff --git a/src/include/duckdb/execution/merge_sort_tree.hpp b/src/include/duckdb/execution/merge_sort_tree.hpp index 672aaa56cd4a..8c04ecde03cf 100644 --- a/src/include/duckdb/execution/merge_sort_tree.hpp +++ b/src/include/duckdb/execution/merge_sort_tree.hpp @@ -118,7 +118,8 @@ struct MergeSortTree { void Build(); - idx_t SelectNth(const SubFrames &frames, idx_t n) const; + // {nth index, remainder} + pair SelectNth(const SubFrames &frames, idx_t n) const; inline ElementType NthElement(idx_t i) const { return tree.front().first[i]; @@ -436,10 +437,10 @@ void MergeSortTree::BuildRun(idx_t level_idx, idx_t run_idx) { } template -idx_t MergeSortTree::SelectNth(const SubFrames &frames, idx_t n) const { +pair MergeSortTree::SelectNth(const SubFrames &frames, idx_t n) const { // Handle special case of a one-element tree if (tree.size() < 2) { - return 0; + return {0, 0}; } // The first level contains a single run, @@ -566,7 +567,7 @@ idx_t MergeSortTree::SelectNth(const SubFrames &frames, idx_t n } } - return result; + return {result, n}; } template diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index a18a0eb5f0bd..dc73e17fc32c 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -195,6 +195,8 @@ class StringValueResult : public ScannerResult { bool projecting_columns = false; idx_t chunk_col_id = 0; + bool icu_loaded = false; + //! We must ensure that we keep the buffers alive until processing the query result unordered_map> buffer_handles; diff --git a/src/include/duckdb/execution/physical_plan_generator.hpp b/src/include/duckdb/execution/physical_plan_generator.hpp index ebd172492bc5..ee635c16adcd 100644 --- a/src/include/duckdb/execution/physical_plan_generator.hpp +++ b/src/include/duckdb/execution/physical_plan_generator.hpp @@ -74,6 +74,8 @@ class PhysicalPlanGenerator { static bool UseBatchIndex(ClientContext &context, PhysicalOperator &plan); //! Whether or not we should preserve insertion order for executing the given sink static bool PreserveInsertionOrder(ClientContext &context, PhysicalOperator &plan); + //! The order preservation type of the given operator decided by recursively looking at its children + static OrderPreservationType OrderPreservationRecursive(PhysicalOperator &op); template PhysicalOperator &Make(ARGS &&... args) { diff --git a/src/include/duckdb/function/window/window_index_tree.hpp b/src/include/duckdb/function/window/window_index_tree.hpp index e9f9f4014188..e95b522747d6 100644 --- a/src/include/duckdb/function/window/window_index_tree.hpp +++ b/src/include/duckdb/function/window/window_index_tree.hpp @@ -36,7 +36,8 @@ class WindowIndexTree : public WindowMergeSortTree { unique_ptr GetLocalState() override; //! Find the Nth index in the set of subframes - idx_t SelectNth(const SubFrames &frames, idx_t n) const; + //! Returns {nth index, 0} or {nth offset, overflow} + pair SelectNth(const SubFrames &frames, idx_t n) const; }; } // namespace duckdb diff --git a/src/include/duckdb/main/extension_entries.hpp b/src/include/duckdb/main/extension_entries.hpp index 89108ef34bca..6f86738a66a6 100644 --- a/src/include/duckdb/main/extension_entries.hpp +++ b/src/include/duckdb/main/extension_entries.hpp @@ -478,6 +478,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"quantile_disc", "core_functions", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"radians", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, {"random", "core_functions", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"read_avro", "avro", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json_auto", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json_objects", "json", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -1086,16 +1087,28 @@ static constexpr ExtensionEntry EXTENSION_SECRET_PROVIDERS[] = { {"mysql/config", "mysql_scanner"}, {"postgres/config", "postgres_scanner"}}; // EXTENSION_SECRET_PROVIDERS -static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {"aws", "azure", - "autocomplete", "core_functions", - "delta", "excel", - "fts", "httpfs", - "iceberg", "inet", - "icu", "json", - "motherduck", "mysql_scanner", - "parquet", "sqlite_scanner", - "sqlsmith", "postgres_scanner", - "tpcds", "tpch", - "uc_catalog", "ui"}; // END_OF_AUTOLOADABLE_EXTENSIONS +static constexpr const char *AUTOLOADABLE_EXTENSIONS[] = {"avro", + "aws", + "azure", + "autocomplete", + "core_functions", + "delta", + "excel", + "fts", + "httpfs", + "iceberg", + "inet", + "icu", + "json", + "motherduck", + "mysql_scanner", + "parquet", + "sqlite_scanner", + "sqlsmith", + "postgres_scanner", + "tpcds", + "tpch", + "uc_catalog", + "ui"}; // END_OF_AUTOLOADABLE_EXTENSIONS } // namespace duckdb diff --git a/src/parser/parsed_data/create_type_info.cpp b/src/parser/parsed_data/create_type_info.cpp index 1ce0327c3c31..4ee92f525a3f 100644 --- a/src/parser/parsed_data/create_type_info.cpp +++ b/src/parser/parsed_data/create_type_info.cpp @@ -61,6 +61,7 @@ string CreateTypeInfo::ToString() const { result += " AS "; result += type.ToString(); } + result += ";"; return result; } diff --git a/src/storage/statistics/numeric_stats.cpp b/src/storage/statistics/numeric_stats.cpp index a9379812292e..4283ea78988e 100644 --- a/src/storage/statistics/numeric_stats.cpp +++ b/src/storage/statistics/numeric_stats.cpp @@ -147,6 +147,7 @@ FilterPropagateResult CheckZonemapTemplated(const BaseStatistics &stats, Express T max_value, T constant) { switch (comparison_type) { case ExpressionType::COMPARE_EQUAL: + case ExpressionType::COMPARE_NOT_DISTINCT_FROM: if (ConstantExactRange(min_value, max_value, constant)) { return FilterPropagateResult::FILTER_ALWAYS_TRUE; } @@ -155,6 +156,7 @@ FilterPropagateResult CheckZonemapTemplated(const BaseStatistics &stats, Express } return FilterPropagateResult::FILTER_ALWAYS_FALSE; case ExpressionType::COMPARE_NOTEQUAL: + case ExpressionType::COMPARE_DISTINCT_FROM: if (!ConstantValueInRange(min_value, max_value, constant)) { return FilterPropagateResult::FILTER_ALWAYS_TRUE; } else if (ConstantExactRange(min_value, max_value, constant)) { diff --git a/src/storage/statistics/string_stats.cpp b/src/storage/statistics/string_stats.cpp index 62262d448349..230944ab1446 100644 --- a/src/storage/statistics/string_stats.cpp +++ b/src/storage/statistics/string_stats.cpp @@ -216,12 +216,14 @@ FilterPropagateResult StringStats::CheckZonemap(const_data_ptr_t min_data, idx_t int max_comp = StringValueComparison(data, MinValue(max_len, size), max_data); switch (comparison_type) { case ExpressionType::COMPARE_EQUAL: + case ExpressionType::COMPARE_NOT_DISTINCT_FROM: if (min_comp >= 0 && max_comp <= 0) { return FilterPropagateResult::NO_PRUNING_POSSIBLE; } else { return FilterPropagateResult::FILTER_ALWAYS_FALSE; } case ExpressionType::COMPARE_NOTEQUAL: + case ExpressionType::COMPARE_DISTINCT_FROM: if (min_comp < 0 || max_comp > 0) { return FilterPropagateResult::FILTER_ALWAYS_TRUE; } diff --git a/test/optimizer/pushdown/distinct_from_pushdown.test b/test/optimizer/pushdown/distinct_from_pushdown.test new file mode 100644 index 000000000000..fa50f3094948 --- /dev/null +++ b/test/optimizer/pushdown/distinct_from_pushdown.test @@ -0,0 +1,27 @@ +# name: test/optimizer/pushdown/distinct_from_pushdown.test +# description: Test DISTINCT FROM pushed down into scans +# group: [pushdown] + +statement ok +create table test as select 'tst' as tst; + +query I +select * from test where tst is not distinct from 'a' or tst is not distinct from 'b'; +---- + +query I +select * from test where tst is distinct from 'a' or tst is distinct from 'b'; +---- +tst + +statement ok +create table test2 as select 42 as tst; + +query I +select * from test2 where tst is not distinct from 12 or tst is not distinct from 13; +---- + +query I +select * from test2 where tst is distinct from 12 or tst is distinct from 13 +---- +42 diff --git a/test/sql/copy/csv/afl/test_fuzz_4086.test b/test/sql/copy/csv/afl/test_fuzz_4086.test new file mode 100644 index 000000000000..08d2d36a80aa --- /dev/null +++ b/test/sql/copy/csv/afl/test_fuzz_4086.test @@ -0,0 +1,20 @@ +# name: test/sql/copy/csv/afl/test_fuzz_4086.test +# description: fuzzer generated csv files - should not raise internal exception (by failed assertion). +# group: [afl] + +require json + +statement ok +PRAGMA enable_verification + +statement maybe +FROM read_csv('data/csv/afl/4086/case_1.csv', auto_detect=false, columns={'json': 'JSON'}, delim=NULL, buffer_size=42, store_rejects=true, rejects_limit=658694493994253607); +---- + +statement maybe +FROM read_csv('data/csv/afl/4086/case_2.csv', auto_detect=false, columns={'json': 'JSON'}, delim=NULL, buffer_size=42, store_rejects=true, rejects_limit=658694493994253607); +---- + +statement maybe +FROM read_csv('data/csv/afl/4086/case_3.csv', auto_detect=false, columns={'json': 'JSON'}, delim='\0', buffer_size=42, store_rejects=true, rejects_limit=658694493994253607); +---- diff --git a/test/sql/window/test_value_orderby.test b/test/sql/window/test_value_orderby.test index 8b51198f277c..8fbb3e9e4cd1 100644 --- a/test/sql/window/test_value_orderby.test +++ b/test/sql/window/test_value_orderby.test @@ -29,3 +29,19 @@ ORDER BY 2 9 8 9 1 7 6 9 9 1 6 3 10 9 1 6 + +# Frame larger than data +query I +with IDS as ( + select * as idx from generate_series(1,4) +),DATA as ( + select *, (case when idx != 3 then idx * 1.0 else NULL end) as value from IDS +) +SELECT + last(value ORDER BY idx IGNORE NULLS) OVER (ORDER BY idx ROWS BETWEEN UNBOUNDED PRECEDING AND 0 FOLLOWING) +FROM DATA +---- +1.0 +2.0 +2.0 +4.0 diff --git a/test/sql/window/test_window_constant_aggregate.test b/test/sql/window/test_window_constant_aggregate.test index 6aafb23d0742..f16d221152b9 100644 --- a/test/sql/window/test_window_constant_aggregate.test +++ b/test/sql/window/test_window_constant_aggregate.test @@ -205,7 +205,7 @@ ORDER BY ALL statement ok pragma threads=2 -loop i 0 100 +loop i 0 20 query III with table_1 AS ( @@ -285,3 +285,27 @@ fb30cf47-6f6b-42ef-dec2-3f984479a2aa 2024-04-01 00:00:00 12 7d1cc557-2d45-6900-a1ed-b2c64f5d9200 2024-02-01 00:00:00 NULL endloop + +# Test implicit ordering for aggregates +loop i 0 20 + +query I +with repro2 AS ( + SELECT range // 59 AS id, random() AS value + FROM range(1475) +), X AS ( + SELECT + list(value) OVER ( + PARTITION BY id + ORDER BY value + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS values + FROM repro2 +) +select count(*) +from X +where values[1] != list_aggregate(values, 'min') +---- +0 + +endloop diff --git a/third_party/fsst/fsst.h b/third_party/fsst/fsst.h index 8e143db8782a..86909be5b5f2 100644 --- a/third_party/fsst/fsst.h +++ b/third_party/fsst/fsst.h @@ -196,7 +196,7 @@ duckdb_fsst_decompress( } } } - if (posOut+24 <= size) { // handle the possibly 3 last bytes without a loop + if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop if (posIn+2 <= lenIn) { strOut[posOut] = strIn[posIn+1]; if (strIn[posIn] != FSST_ESC) { diff --git a/tools/shell/shell.cpp b/tools/shell/shell.cpp index 72c7cf55f020..ba31af324940 100644 --- a/tools/shell/shell.cpp +++ b/tools/shell/shell.cpp @@ -1736,11 +1736,7 @@ void ShellState::ExecutePreparedStatement(sqlite3_stmt *pStmt) { /* extract the data and data types */ for (int i = 0; i < nCol; i++) { result.types[i] = sqlite3_column_type(pStmt, i); - if (result.types[i] == SQLITE_BLOB && cMode == RenderMode::INSERT) { - result.data[i] = ""; - } else { - result.data[i] = (const char *)sqlite3_column_text(pStmt, i); - } + result.data[i] = (const char *)sqlite3_column_text(pStmt, i); if (!result.data[i] && result.types[i] != SQLITE_NULL) { // OOM rc = SQLITE_NOMEM; diff --git a/tools/shell/tests/test_shell_basics.py b/tools/shell/tests/test_shell_basics.py index e2c554bb5a2d..1523ac2ccef2 100644 --- a/tools/shell/tests/test_shell_basics.py +++ b/tools/shell/tests/test_shell_basics.py @@ -834,6 +834,17 @@ def test_dump_mixed(shell): result = test.run() result.check_stdout('CREATE TABLE a(d DATE, k FLOAT, t TIMESTAMP);') +def test_dump_blobs(shell): + test = ( + ShellTest(shell) + .statement("create table test(t VARCHAR, b BLOB);") + .statement(".changes off") + .statement("insert into test values('literal blob', '\\x07\\x08\\x09');") + .statement(".dump") + ) + result = test.run() + result.check_stdout("'\\x07\\x08\\x09'") + def test_invalid_csv(shell, tmp_path): file = tmp_path / 'nonsencsv.csv' with open(file, 'wb+') as f: @@ -869,18 +880,6 @@ def test_mode_trash(shell): result = test.run() result.check_stdout('') -@pytest.mark.skip(reason="Broken test, ported directly, was commented out") -def test_dump_blobs(shell): - test = ( - ShellTest(shell) - .statement("CREATE TABLE a (b BLOB);") - .statement(".changes off") - .statement("INSERT INTO a VALUES (DATE '1992-01-01', 0.3, NOW());") - .statement(".dump") - ) - result = test.run() - result.check_stdout('COMMIT') - def test_sqlite_comments(shell): # Using /* */ test = (