A git2rdata object consists of two files. The ".tsv" file contains the raw data as a plain-text, tab-separated file. The ".yml" file contains the metadata on the columns in plain-text YAML format. See vignette("plain text", package = "git2rdata") for more details on the implementation.

write_vc(x, file, root = ".", sorting, strict = TRUE,
  optimize = TRUE, na = "NA", ...)

# S3 method for git_repository
write_vc(x, file, root, sorting, strict = TRUE,
  optimize = TRUE, na = "NA", ..., stage = FALSE, force = FALSE)

Arguments

x

the data.frame.

file

the name of the git2rdata object. Git2rdata objects cannot have dots in their name. The name may include a relative path. file is a path relative to the root. Note that file must point to a location within root.

root

The root of a project. Can be a file path or a git-repository. Defaults to the current working directory (".").

sorting

an optional vector of column names defining which columns to use for sorting x and in what order to use them. Omitting sorting yields a warning. Add sorting to avoid this warning. Strongly recommended in combination with version control. See vignette("efficiency", package = "git2rdata") for an illustration of the importance of sorting.

strict

What to do when the metadata changes. strict = FALSE overwrites the data and the metadata with a warning listing the changes; strict = TRUE returns an error and leaves the data and metadata as is. Defaults to TRUE.

optimize

If TRUE, recode the data to get smaller text files. If FALSE, meta() converts the data to character. Defaults to TRUE.

na

the string to use for missing values in the data.

...

parameters used in some methods

stage

Logical value indicating whether to stage the changes after writing the data. Defaults to FALSE.

force

Logical value indicating whether to add ignored files when staging. Defaults to FALSE.

Value

a named vector with the file paths relative to root. The names contain the hashes of the files.

Note

..generic is a reserved name for the metadata and is a forbidden column name in a data.frame.

See also

Examples

## on file system # create a directory root <- tempfile("git2rdata-") dir.create(root) # write a dataframe to the directory write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length")
#> 11d423327dc1f67e16183dbd9871ad931d3b9689 #> "iris.tsv" #> bcf0c81601329774ce177b76919f64d509131788 #> "iris.yml"
# check that a data file (.tsv) and a metadata file (.yml) exist. list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# read the git2rdata object from the directory read_vc("iris", root)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 4.6 3.1 1.5 0.2 setosa #> 2 4.7 3.2 1.3 0.2 setosa #> 3 4.9 3.0 1.4 0.2 setosa #> 4 5.0 3.6 1.4 0.2 setosa #> 5 5.1 3.5 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa
# store a new version with different observations but the same metadata write_vc(iris[1:5, ], "iris", root)
#> 5bf24c9e9b6830eebd749bac23ce599774d4087e #> "iris.tsv" #> aa9c8665def5a9644ec8157943631f63ac46e5c1 #> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# Removing a column requires new metadata. # Add strict = FALSE to override the existing metadata. write_vc( iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE )
#> Warning: Changes in the metadata may lead to unnecessarily large diffs. #> See vignette('version_control', package = 'git2rdata') for more information. #> #> - New data has a different number of variables. #> - Deleted variables: Sepal.Width.
#> 1a649561f36566acf21c2f167e69abc54907badb #> "iris.tsv" #> bf5ce7f305eafd190fc6354181c7dac15b5494fb #> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# storing the original version again requires another update of the metadata write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE)
#> Warning: Changes in the metadata may lead to unnecessarily large diffs. #> See vignette('version_control', package = 'git2rdata') for more information. #> #> - The sorting variables changed. #> - Sorting for the new data: 'Sepal.Width'. #> - Sorting for the old data: 'Sepal.Length'. #> - New data has a different number of variables. #> - New variables: Sepal.Width.
#> b13fe258fa3e684284e1e13a081dcc216672500e #> "iris.tsv" #> 1a12d45cfbec489feae910be9e411d5712a58830 #> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# optimize = FALSE stores the data in a more verbose format. This results in larger files. write_vc( iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE )
#> 965536eea40ded6da531ade9fb89ea79a423810d #> "iris2.tsv" #> 1b3ded16c33ab6ad9fe70f77bc3709ac7abfd1b5 #> "iris2.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml" "iris2.tsv" "iris2.yml"
## on git repo using a git2r::git-repository # initialise a git repo using the git2r package repo_path <- tempfile("git2rdata-repo-") dir.create(repo_path) repo <- git2r::init(repo_path) git2r::config(repo, user.name = "Alice", user.email = "alice@example.org") # store a dataframe in git repo. write_vc(iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length")
#> 11d423327dc1f67e16183dbd9871ad931d3b9689 #> "iris.tsv" #> bcf0c81601329774ce177b76919f64d509131788 #> "iris.yml"
# This git2rdata object is not staged by default. status(repo)
#> Untracked files: #> Untracked: iris.tsv #> Untracked: iris.yml #>
# read a dataframe from a git repo read_vc("iris", repo)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species #> 1 4.6 3.1 1.5 0.2 setosa #> 2 4.7 3.2 1.3 0.2 setosa #> 3 4.9 3.0 1.4 0.2 setosa #> 4 5.0 3.6 1.4 0.2 setosa #> 5 5.1 3.5 1.4 0.2 setosa #> 6 5.4 3.9 1.7 0.4 setosa
# store a new version in the git repo and stage it in one go write_vc(iris[1:5, ], "iris", repo, stage = TRUE)
#> 5bf24c9e9b6830eebd749bac23ce599774d4087e #> "iris.tsv" #> aa9c8665def5a9644ec8157943631f63ac46e5c1 #> "iris.yml"
status(repo)
#> Staged changes: #> New: iris.tsv #> New: iris.yml #>
# store a verbose version in a different git2rdata object write_vc( iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE )
#> 965536eea40ded6da531ade9fb89ea79a423810d #> "iris2.tsv" #> 1b3ded16c33ab6ad9fe70f77bc3709ac7abfd1b5 #> "iris2.yml"
status(repo)
#> Untracked files: #> Untracked: iris2.tsv #> Untracked: iris2.yml #> #> Staged changes: #> New: iris.tsv #> New: iris.yml #>
# clean up junk <- file.remove( list.files(root, full.names = TRUE, recursive = TRUE), root) junk <- file.remove( rev(list.files(repo_path, full.names = TRUE, recursive = TRUE, include.dirs = TRUE, all.files = TRUE)), repo_path)