Store a Data.Frame as a Git2rdata Object on Disk

A git2rdata object consists of two files. The ".tsv" file contains the raw data as a plain text tab-separated file. The ".yml" file contains the metadata on the columns in plain text YAML format. See vignette("plain text", package = "git2rdata") for more details on the implementation.

Usage

write_vc(
  x,
  file,
  root = ".",
  sorting,
  strict = TRUE,
  optimize = TRUE,
  na = "NA",
  ...,
  split_by,
  convert
)

# S3 method for class 'character'
write_vc(
  x,
  file,
  root = ".",
  sorting,
  strict = TRUE,
  optimize = TRUE,
  na = "NA",
  ...,
  append = FALSE,
  split_by = character(0),
  digits,
  convert = list()
)

# S3 method for class 'git_repository'
write_vc(
  x,
  file,
  root,
  sorting,
  strict = TRUE,
  optimize = TRUE,
  na = "NA",
  ...,
  stage = FALSE,
  force = FALSE
)

Arguments

x: the data.frame.
file: the name of the git2rdata object. Git2rdata objects cannot have dots in their name. The name may include a relative path. file is a path relative to the root. Note that file must point to a location within root.
root: The root of a project. Can be a file path or a git-repository. Defaults to the current working directory (".").
sorting: an optional vector of column names defining which columns to use for sorting x and in what order to use them. The default empty sorting yields a warning. Add sorting to avoid this warning. Strongly recommended in combination with version control. See vignette("efficiency", package = "git2rdata") for an illustration of the importance of sorting.
strict: What to do when the metadata changes. strict = FALSE overwrites the data and the metadata with a warning listing the changes, strict = TRUE returns an error and leaves the data and metadata as is. Defaults to TRUE.
optimize: If TRUE, recode the data to get smaller text files. If FALSE, meta() converts the data to character. Defaults to TRUE.
na: the string to use for missing values in the data.
...: parameters used in some methods
split_by: An optional vector of variable names used to split the text files. This creates a separate file for every combination. We prepend these variables to the vector of sorting variables.
convert: An optional named list for column conversions. Names must be present in the column names of x. Each element must be a character vector of length 2 with names write and read, containing function names in the package::function format. The write function is applied before storing, and read function is applied when reading back the data.
append: logical. Only relevant if file is a character string. If TRUE, the output is appended to the file. If FALSE, any existing file of the name is destroyed.
digits: The number of significant digits of the smallest absolute value. The function applies the rounding automatically. Only relevant for numeric variables. Either a single positive integer or a named vector where the names link to the variables in the data.frame. Defaults to 6 with a warning.
stage: Logical value indicating whether to stage the changes after writing the data. Defaults to FALSE.
force: Add ignored files. Default is FALSE.

Value

a named vector with the file paths relative to root. The names contain the hashes of the files.

Note

..generic is a reserved name for the metadata and is a forbidden column name in a data.frame.

Examples

## on file system

# create a directory
root <- tempfile("git2rdata-")
dir.create(root)

# write a dataframe to the directory
write_vc(
  iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length",
  digits = 6
)
#> 09d5bfd6a65e682a4ca030c766348180861568c8 
#>                               "iris.tsv" 
#> f5eda4fcbe143eefc267a51a511110c604848272 
#>                               "iris.yml" 
# check that a data file (.tsv) and a metadata file (.yml) exist.
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# read the git2rdata object from the directory
read_vc("iris", root)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          4.6         3.1          1.5         0.2  setosa
#> 2          4.7         3.2          1.3         0.2  setosa
#> 3          4.9         3.0          1.4         0.2  setosa
#> 4          5.0         3.6          1.4         0.2  setosa
#> 5          5.1         3.5          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
#> 
#> Use `display_metadata()` to view the metadata.

# store a new version with different observations but the same metadata
write_vc(iris[1:5, ], "iris", root)
#> 31ff841b58e569e8a4a4ac2f02152295c19f94db 
#>                               "iris.tsv" 
#> f5eda4fcbe143eefc267a51a511110c604848272 
#>                               "iris.yml" 
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# Removing a column requires new metadata.
# Add strict = FALSE to override the existing metadata.
write_vc(
  iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE
)
#> Warning: Changes in the metadata may lead to unnecessarily large diffs.
#> See vignette('version_control', package = 'git2rdata') for more information.
#> 
#> - New data has a different number of variables.
#> - Deleted variables: Sepal.Width.
#> b2098d507b0d749a86bb61a185ab2d31f7622418 
#>                               "iris.tsv" 
#> 08179205a52ffe296818ef844180093eaaadfe00 
#>                               "iris.yml" 
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# storing the original version again requires another update of the metadata
write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE)
#> Warning: Changes in the metadata may lead to unnecessarily large diffs.
#> See vignette('version_control', package = 'git2rdata') for more information.
#> 
#> - The sorting variables changed.
#>     - Sorting for the new data: 'Sepal.Width'.
#>     - Sorting for the old data: 'Sepal.Length'.
#> - New data has a different number of variables.
#> - New variables: Sepal.Width.
#> 4045436d3a61801f4eaad5769e32726838deecbc 
#>                               "iris.tsv" 
#> 4e0919ca66a485cf0e198981782f3cd122d10fef 
#>                               "iris.yml" 
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# optimize = FALSE stores the data in a more verbose format. This requires larger files.
write_vc(
  iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE,
  digits = 6
)
#> 79547bc5fecc2c82bd01988d1591130e578fdcf9 
#>                              "iris2.csv" 
#> 4f86db2012b3267f1a50131945158aead6d918ec 
#>                              "iris2.yml" 
list.files(root, recursive = TRUE)
#> [1] "iris.tsv"  "iris.yml"  "iris2.csv" "iris2.yml"



## on git repo using a git2r::git-repository

# initialise a git repo using the git2r package
repo_path <- tempfile("git2rdata-repo-")
dir.create(repo_path)
repo <- git2r::init(repo_path)
git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")

# store a dataframe in git repo.
write_vc(
  iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length",
  digits = 6
)
#> 09d5bfd6a65e682a4ca030c766348180861568c8 
#>                               "iris.tsv" 
#> f5eda4fcbe143eefc267a51a511110c604848272 
#>                               "iris.yml" 
# This git2rdata object is not staged by default.
status(repo)
#> Untracked files:
#> 	Untracked:  iris.tsv
#> 	Untracked:  iris.yml
#> 
# read a dataframe from a git repo
read_vc("iris", repo)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          4.6         3.1          1.5         0.2  setosa
#> 2          4.7         3.2          1.3         0.2  setosa
#> 3          4.9         3.0          1.4         0.2  setosa
#> 4          5.0         3.6          1.4         0.2  setosa
#> 5          5.1         3.5          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
#> 
#> Use `display_metadata()` to view the metadata.

# store a new version in the git repo and stage it in one go
write_vc(iris[1:5, ], "iris", repo, stage = TRUE)
#> 31ff841b58e569e8a4a4ac2f02152295c19f94db 
#>                               "iris.tsv" 
#> f5eda4fcbe143eefc267a51a511110c604848272 
#>                               "iris.yml" 
status(repo)
#> Staged changes:
#> 	New:        iris.tsv
#> 	New:        iris.yml
#> 

# store a verbose version in a different git2rdata object
write_vc(
  iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE,
  digits = 6
)
#> 79547bc5fecc2c82bd01988d1591130e578fdcf9 
#>                              "iris2.csv" 
#> 4f86db2012b3267f1a50131945158aead6d918ec 
#>                              "iris2.yml" 
status(repo)
#> Untracked files:
#> 	Untracked:  iris2.csv
#> 	Untracked:  iris2.yml
#> 
#> Staged changes:
#> 	New:        iris.tsv
#> 	New:        iris.yml
#>