diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..8dd399a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..3225814
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# Enforce Linux-style line endings for all text files
+* text=auto eol=lf
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..6f00ac0
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,92 @@
+name: CI
+
+on:
+ push:
+ pull_request:
+
+jobs:
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install lint dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[dev]
+ - name: Lint
+ run: flake8 src tests tools
+ - name: Validate packaged registry
+ run: python tools/check_registry.py
+ - name: Validate notebooks
+ run: python tools/check_notebooks.py
+ - name: Check notebook exports
+ run: python tools/export_notebooks.py --check
+ - name: Check README sync
+ run: python tools/gen_readme.py --check
+
+ docs-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install docs extras
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[docs]
+ - name: Export notebooks and README
+ run: |
+ python tools/export_notebooks.py --check
+ python tools/gen_readme.py --check
+ - name: Build docs
+ run: mkdocs build --strict
+
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[test]
+ - name: Test
+ run: pytest
+
+ build-dist:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install build dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install build twine
+ - name: Build distributions
+ run: python -m build
+ - name: Validate metadata
+ run: python -m twine check dist/*
+ - name: Check packaged files
+ run: python tools/check_dist.py dist
+ - name: Install built wheel and smoke-test it
+ run: |
+ python -m pip install --force-reinstall --no-deps dist/*.whl
+ python - <<'PY'
+ import atomref as ar
+
+ assert ar.get_covalent_radius('C') == 0.76
+ assert ar.get_vdw_radius('C') == 1.77
+ assert 'atomic_radius' in ar.list_quantities()
+ assert 'rahm2016' in ar.list_dataset_ids('atomic_radius', usage_role='support')
+ PY
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..418ce0d
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,25 @@
+name: Docs
+
+on:
+ push:
+ branches: [main, master]
+ workflow_dispatch:
+
+jobs:
+ build-docs:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install docs extras
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[docs]
+ - name: Check generated files
+ run: |
+ python tools/export_notebooks.py --check
+ python tools/gen_readme.py --check
+ - name: Build docs
+ run: mkdocs build --strict
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..18d2c3a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,127 @@
+# Changelog
+
+## 0.1.4 - 2026-03-15
+
+### Added
+
+- `LookupResult.transfer_depth`, which records how many transfer steps were
+ involved in the returned numeric value.
+- Explicit nested-policy safeguards for `LinearTransfer` via:
+ - `fit_sources`
+ - `fit_max_depth`
+ - `prediction_sources`
+ - `prediction_max_depth`
+- Regression tests covering generic-policy cycles, wrapper-policy cycles,
+ conservative nested-fit defaults, and explicit opt-in for deeper nested
+ linear workflows.
+
+### Changed
+
+- Nested policy-backed linear transfers are now guarded in two phases:
+ conservative defaults are used for fit training, while one additional nested
+ completion step remains allowed at prediction time.
+- Linear-transfer fitting now distinguishes direct predictor values from nested
+ policy-derived predictor values.
+- Cycle detection now tracks both generic policies and wrapper policies using a
+ context-local activation stack, so recursion through freshly materialized
+ wrapper policies is detected reliably and safely.
+- Radii and X–H convenience helpers now resolve through wrapper-aware cycle
+ tracking rather than materializing a fresh generic policy for each public
+ lookup call.
+
+### Documentation
+
+- Expanded the transfer and policy docs to explain nested-policy safeguards,
+ `transfer_depth`, and cycle detection.
+- Added guidance on when chained correlations are scientifically reasonable and
+ how to opt in deliberately when broader fit training is desired.
+
+## 0.1.3 - 2026-03-15
+
+### Added
+
+- Support for using generic policies and wrapper policies as transfer sources in
+ `SubstitutionTransfer` and `LinearTransfer`.
+- Public `atomref.xh` module docs and examples for policy-backed predictor
+ workflows.
+
+### Changed
+
+- `LinearTransfer` now treats predictors as **sources** rather than only raw
+ datasets, while still keeping the current runtime to one predictor at a time.
+- Generic policy resolution now supports blocked element keys, which is used by
+ the X–H helper to prevent invalid `H` parent-element lookups.
+- Transfer results now preserve nested-policy provenance through
+ `resolved_from` and explanatory notes when a policy source is involved.
+
+## 0.1.2 - 2026-03-15
+
+### Added
+
+- New `xh_bond_length` quantity family.
+- Packaged provisional X–H dataset `csd_legacy_xh_cno` with ConQuest/CSD
+ hydrogen-normalisation targets for `C`, `N`, and `O`.
+- New `atomref.xh` convenience layer with `XHPolicy`, `DEFAULT_XH_POLICY`, set
+ listing helpers, and X–H lookup helpers.
+
+### Documentation
+
+- Added X–H dataset and API pages.
+- Documented the provisional scope of X–H support in `0.1.x` and the planned
+ broader follow-up in `0.2.x`.
+
+## 0.1.1 - 2026-03-15
+
+### Added
+
+- Public generic lookup helpers `lookup_value(...)` and `get_value(...)`.
+- Tests for alias normalization, immutable metadata, non-finite-value rejection,
+ collision detection, and explicit placeholder notes.
+
+### Changed
+
+- Registry metadata returned by `get_dataset_info(...)` is now frozen so callers
+ cannot mutate the cached registry state.
+- Dataset-alias resolution now normalizes Unicode and dash variants more
+ robustly.
+- Custom-set construction and policy configuration now reject normalized-key
+ collisions and non-finite numeric values.
+- Radii-specific wrappers now reject negative override and fallback values.
+- Base and substitution lookups now emit explicit placeholder notes when the
+ returned numeric value is a dataset placeholder.
+- `LinearTransfer` now validates empty-predictor and invalid-`min_points`
+ configurations eagerly.
+- The docs now explain the distinction between quantity, domain, dataset, and
+ policy, and clarify that the current runtime supports only the `element`
+ domain.
+
+## 0.1.0 - 2026-03-15
+
+First public release.
+
+### Added
+
+- Packaged element metadata and curated radii tables.
+- Quantity-aware registry metadata that separates operational lookup quantity
+ from scientific classification and dataset usage role.
+- Provenance-aware radii policies with deterministic resolution order.
+- Substitution and linear-transfer support for restoring missing values from
+ curated support datasets.
+- Public helpers for inspecting quantities, dataset metadata, and packaged
+ built-in sets.
+- Runnable notebooks together with generated Markdown notebook pages in the
+ documentation.
+- Validation and maintenance tools for registry checks, notebook export, README
+ generation, and distribution-artifact inspection.
+
+### Documentation
+
+- Expanded dataset guides with citations and selection-oriented descriptions.
+- Added module-level API pages and notebook walkthroughs.
+- Added developer-facing curation and tooling notes.
+
+### Packaging
+
+- Built and validated wheel and source-distribution artifacts.
+- Added CI coverage for linting, tests, docs builds, notebook sync, and
+ distribution checks.
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..f288702
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/DEV_PLAN.md b/DEV_PLAN.md
new file mode 100644
index 0000000..94cdaac
--- /dev/null
+++ b/DEV_PLAN.md
@@ -0,0 +1,33 @@
+# Development plan
+
+## Current status (implemented in the `0.1.x` line)
+
+- stable element metadata
+- curated covalent, van der Waals, and atomic-radius support datasets
+- explicit provenance and coverage metadata
+- generic value-policy core plus radii and X–H convenience wrappers
+- substitution and linear transfer
+- custom element-indexed scalar sets
+- policy-backed transfer sources
+- nested-policy safeguards, transfer-depth tracking, and cycle detection
+- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and
+ `DEFAULT_XH_POLICY`
+
+## Planned for `0.2.x`
+
+- broader X–H datasets and policies
+- experimental plus computational support sets
+- pairwise helper logic such as reference sums and normalization schemes
+- restoration of incomplete experimental data from broader-support predictors
+
+## Longer-term design ideas
+
+- radial atomic reference functions
+- simple proto-density support based on spherically averaged atomic data
+
+## Possible future directions
+
+- more radii sets
+- uncertainty and confidence flags
+- ion-specific or atom-type-specific domains
+- density-derived radii and related reference transforms
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0a04128
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/NOTICE.md b/NOTICE.md
new file mode 100644
index 0000000..01f1cf1
--- /dev/null
+++ b/NOTICE.md
@@ -0,0 +1,12 @@
+# atomref
+
+atomref is a Python library for curated atomic reference data and transfer
+policies for geometry and structure-analysis algorithms.
+
+Copyright (c) 2026 Ivan Chernyshov
+License: LGPL-3.0-or-later (see LICENSE and COPYING)
+
+## Third-party material
+
+The initial scaffold reuses and adapts data tables and design ideas from the
+Delone Commons `molcryst` repository, also authored by Ivan Chernyshov.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..869aace
--- /dev/null
+++ b/README.md
@@ -0,0 +1,173 @@
+# atomref
+
+[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml)
+[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml)
+[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/)
+[![Python versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/)
+[![License](https://img.shields.io/badge/license-LGPL--3.0--or--later-blue.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE)
+
+`atomref` is a small pure-Python package for **curated atomic reference data**
+and **provenance-aware lookup policies** used by geometry and
+structure-analysis algorithms.
+
+It is not meant to be yet another periodic-table encyclopedia. The package is
+for code that needs stable atomic reference values with explicit provenance,
+clear fallback behavior, and honest handling of incomplete preferred datasets.
+
+What you get in the current release line:
+
+- stable element metadata,
+- curated named radii sets,
+- provisional X–H bond-length support for hydrogen-normalisation workflows,
+- dataset provenance and coverage metadata,
+- deterministic lookup policies,
+- substitution and linear transfer from support datasets or policies into target datasets,
+- guarded nested policy-backed transfers with explicit transfer depth,
+ conservative fit/prediction controls, and cycle detection,
+- user-defined custom element-indexed scalar sets.
+
+## Core terms
+
+`atomref` uses a small vocabulary on purpose.
+
+- **quantity** — the operational property family being requested, such as
+ `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or
+ `xh_bond_length`.
+- **domain** — the key space used to index that quantity. In the current
+ runtime, the supported domain is `element`, meaning lookups are keyed by an
+ element symbol.
+- **dataset** — one curated named table inside a quantity, such as
+ `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`.
+- **policy** — the ordered rule set that decides what value to return when the
+ preferred dataset is incomplete.
+
+The metadata layer already records `domain` explicitly because the package is
+built for later extension, but the current runtime intentionally keeps the
+implementation narrow and stable: **the current runtime resolves only
+element-domain scalar values**.
+
+## Why this exists
+
+Scientific software often wants a complete lookup table, but the best dataset
+for the job is rarely complete. `atomref` makes that situation explicit.
+Instead of hiding ad hoc defaults inside algorithm code, you choose a target
+set, describe how missing values may be restored, and keep provenance on what
+was actually returned.
+
+The built-in default behavior is intentionally simple and practical:
+
+- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target
+ set, with missing values substituted from the **legacy CSD covalent radii**
+ (`csd_legacy_cov`).
+- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target
+ set, with missing values restored from the **Rahm isodensity atomic radii**
+ (`rahm2016`) through a fitted linear transfer.
+- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a
+ provisional sparse X–H target set for `C`, `N`, and `O`, with other parent
+ elements inferred from **Cordero covalent radii** through a fitted linear
+ transfer.
+
+Nested policy predictors are supported too. `LinearTransfer` separates
+**fit-time** use of nested predictor values from **prediction-time** use. By default, the fit may use only direct nested
+values, while the final requested element may still use one additional
+nested completion step. That is a useful compromise for workflows such as
+provisional X–H inference from a chosen covalent-radii policy.
+
+## Quick example
+
+```pycon
+>>> import atomref as ar
+>>> ar.get_covalent_radius("C")
+0.76
+>>> ar.get_vdw_radius("O")
+1.5
+>>> ar.get_xh_bond_length("N")
+1.015
+>>> lookup = ar.lookup_vdw_radius("Pm")
+>>> lookup.value
+2.8972265395148358
+>>> lookup.source
+'transfer_linear'
+>>> lookup.transfer_depth
+1
+>>> lookup.resolved_from
+(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),)
+```
+
+`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also
+records where the value came from, whether a transfer model or policy source was
+involved, and how many transfer steps were needed (`transfer_depth`).
+
+You can inspect the packaged quantity and dataset catalog directly:
+
+```pycon
+>>> import atomref as ar
+>>> ar.list_quantities()
+('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length')
+>>> ar.get_quantity_info("xh_bond_length")
+QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.')
+>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")]
+['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020']
+```
+
+You can also load a packaged set directly:
+
+```pycon
+>>> import atomref as ar
+>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013")
+>>> vdw.get("O")
+1.5
+>>> xh = ar.get_xh_set("csd_legacy_xh_cno")
+>>> xh.get("C")
+1.089
+```
+
+## Notebook walkthroughs
+
+The repository ships example notebooks for the main workflows. In the
+documentation they are also available as rendered Markdown pages, so users can
+read them without opening Jupyter first.
+
+- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/)
+- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/)
+- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/)
+- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/)
+
+## Relationship to Delone Commons
+
+`atomref` is designed as a standalone package, but within Delone Commons it is
+primarily intended to support chemistry-aware packages such as:
+
+- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows,
+- future `chemvoro`, for chemistry-aware contact and hydrogen workflows.
+
+By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical
+packages and are not direct consumers of `atomref`.
+
+## Data curation and developer tools
+
+The repository also ships small maintenance tools. The most important ones are:
+
+- `python tools/check_registry.py` — validate curated registry metadata against
+ packaged CSV tables,
+- `python tools/check_notebooks.py` — execute notebook code cells,
+- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for
+ the docs,
+- `python tools/gen_readme.py` — regenerate `README.md` from this page,
+- `python tools/release_check.py` — run the full release-preparation checklist,
+ including linting, tests, docs, builds, and artifact validation.
+
+See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md)
+for a short description of each script.
+
+---
+
+This README is generated from `docs/index.md`.
+
+To regenerate it:
+
+```bash
+python tools/gen_readme.py
+```
+
+Edit the documentation sources instead of editing `README.md` directly.
diff --git a/docs/api/atomref.md b/docs/api/atomref.md
new file mode 100644
index 0000000..3536e34
--- /dev/null
+++ b/docs/api/atomref.md
@@ -0,0 +1,6 @@
+# atomref
+
+The top-level package re-exports the main user-facing API so that most code can
+simply do `import atomref as ar`.
+
+::: atomref
diff --git a/docs/api/elements.md b/docs/api/elements.md
new file mode 100644
index 0000000..2f066c7
--- /dev/null
+++ b/docs/api/elements.md
@@ -0,0 +1,7 @@
+# atomref.elements
+
+Element identity is intentionally minimal in the current implementation:
+atomic number, symbol, and name. The module also contains the canonicalization helpers used throughout the
+package.
+
+::: atomref.elements
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 0000000..f56eb7c
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,33 @@
+# API
+
+The public API is small on purpose.
+
+Most users will spend most of their time in the top-level package namespace and
+in the quantity-specific convenience helpers. The lower-level modules are still
+documented because they expose the actual data model behind the package.
+
+## Common tasks
+
+- get a single value: use `get_covalent_radius(...)`, `get_vdw_radius(...)`, or
+ `get_xh_bond_length(...)`
+- inspect provenance: use `lookup_covalent_radius(...)`,
+ `lookup_vdw_radius(...)`, `lookup_xh_bond_length(...)`, or the generic
+ `lookup_value(...)`
+- browse packaged datasets: use `list_quantities()`, `get_quantity_info(...)`,
+ `list_dataset_infos(...)`, `list_radii_set_infos(...)`, or
+ `list_xh_set_infos(...)`
+- load a packaged set directly: use `get_builtin_set(...)`, `get_radii_set(...)`,
+ or `get_xh_set(...)`
+- define a custom set: use `ElementScalarSet.from_mapping(...)`
+- define transfer-backed lookup behavior: use `ValuePolicy`, `RadiiPolicy`,
+ `XHPolicy`, `SubstitutionTransfer`, and `LinearTransfer`
+
+## Module reference
+
+- [Top-level package](atomref.md)
+- [Elements](elements.md)
+- [Registry and packaged datasets](registry.md)
+- [Transfer models](transfer.md)
+- [Generic policy core](policy.md)
+- [Radii API](radii.md)
+- [X–H API](xh.md)
diff --git a/docs/api/policy.md b/docs/api/policy.md
new file mode 100644
index 0000000..29b4142
--- /dev/null
+++ b/docs/api/policy.md
@@ -0,0 +1,26 @@
+# atomref.policy
+
+This module contains the generic resolver that sits below the radii-specific and
+X–H-specific convenience APIs.
+
+Use it when you want to work directly with the shared value-selection engine:
+
+- `ValuePolicy` — generic element-domain policy configuration,
+- `lookup_value(...)` — resolve one value together with provenance,
+- `get_value(...)` — resolve only the numeric value,
+- `LookupResult` — the structured result object returned by the resolver.
+
+A few practical notes:
+
+- The current runtime supports **element-domain** scalar policies.
+- `ValuePolicy` normalizes element-symbol overrides eagerly.
+- Transfer sources may be packaged datasets, custom sets, generic policies, or
+ wrapper policies that expose `as_value_policy()`.
+- `LookupResult.is_placeholder` refers to the returned numeric value itself, not
+ to whether any transfer happened.
+- `LookupResult.transfer_depth` counts how many transfer steps were involved in
+ the returned numeric value.
+- Nested lookup is cycle-checked across both generic `ValuePolicy` objects and
+ wrapper policies such as `RadiiPolicy` and `XHPolicy`.
+
+::: atomref.policy
diff --git a/docs/api/radii.md b/docs/api/radii.md
new file mode 100644
index 0000000..ff5e214
--- /dev/null
+++ b/docs/api/radii.md
@@ -0,0 +1,8 @@
+# atomref.radii
+
+This is the main user-facing module for radii workflows.
+
+It provides radii policies, packaged radii-set discovery, lookup helpers, and
+policy-assessment reports.
+
+::: atomref.radii
diff --git a/docs/api/registry.md b/docs/api/registry.md
new file mode 100644
index 0000000..9c41653
--- /dev/null
+++ b/docs/api/registry.md
@@ -0,0 +1,19 @@
+# atomref.registry
+
+This module contains the packaged data model.
+
+If you want to understand how `atomref` classifies datasets, how aliases are
+resolved, or how built-in CSV tables are turned into typed in-memory objects,
+this is the key module to read.
+
+The most important registry ideas are:
+
+- **quantity** — the operational property family,
+- **domain** — the key space used to index that quantity,
+- **dataset** — one curated named table inside the quantity.
+
+In the current runtime, the implemented lookup domain is `element`.
+The registry still stores `domain` explicitly because the metadata design is
+meant to stay reusable as the package grows.
+
+::: atomref.registry
diff --git a/docs/api/transfer.md b/docs/api/transfer.md
new file mode 100644
index 0000000..17e07ad
--- /dev/null
+++ b/docs/api/transfer.md
@@ -0,0 +1,39 @@
+# atomref.transfer
+
+Transfer models describe how missing target values may be restored from other
+sources.
+
+In the current runtime the built-in models are:
+
+- direct substitution (`SubstitutionTransfer`),
+- one-predictor linear transfer (`LinearTransfer`).
+
+A transfer source may be:
+
+- a packaged dataset reference,
+- a custom `ElementScalarSet`,
+- a generic `ValuePolicy`,
+- a wrapper policy that exposes `as_value_policy()`.
+
+`LinearTransfer` currently accepts exactly one predictor source at runtime, even
+though the public API stores predictors as a tuple for forward compatibility.
+
+For policy-backed linear predictors, `LinearTransfer` separates two questions:
+
+- which nested predictor values may be used to **fit** the linear model
+ (`fit_sources`, `fit_max_depth`), and
+- which nested predictor values may be used to **predict** the final requested
+ element (`prediction_sources`, `prediction_max_depth`).
+
+The defaults are intentionally conservative:
+
+- fit only on nested predictor values that came directly from `base` or
+ `override`,
+- but allow one additional nested transfer step when evaluating the predictor
+ for the requested element.
+
+That default is meant for workflows such as a sparse X–H target set correlated
+against a partial covalent-radii policy that is itself completed from a broader
+support set.
+
+::: atomref.transfer
diff --git a/docs/api/xh.md b/docs/api/xh.md
new file mode 100644
index 0000000..f96db27
--- /dev/null
+++ b/docs/api/xh.md
@@ -0,0 +1,24 @@
+# atomref.xh
+
+This module provides the provisional X–H bond-length helpers available in the
+current release line.
+
+It is intentionally narrow:
+
+- one packaged sparse target dataset, `csd_legacy_xh_cno`,
+- one wrapper policy, `XHPolicy`,
+- convenience helpers for listing packaged X–H sets and resolving X–H values.
+
+The built-in quantity is keyed by the **parent element `X`** in `X–H` and is
+currently aimed at hydrogen-position normalisation or related geometry
+workflows.
+
+In the default policy:
+
+- `C`, `N`, and `O` use curated ConQuest/CSD defaults,
+- other parent elements may be inferred from `cordero2008`,
+- policy-backed predictors are supported as well, with conservative nested-fit
+ defaults and one additional nested prediction step allowed by default,
+- fuller X–H literature support is planned for `0.2.x`.
+
+::: atomref.xh
diff --git a/docs/datasets/atomic_radius.md b/docs/datasets/atomic_radius.md
new file mode 100644
index 0000000..2852b3e
--- /dev/null
+++ b/docs/datasets/atomic_radius.md
@@ -0,0 +1,22 @@
+# Atomic radius
+
+The `atomic_radius` quantity exists to hold support datasets that are
+scientifically useful but should not be presented as direct condensed-phase vdW
+radii.
+
+## Rahm isodensity atomic radii (`rahm2016`)
+
+This is currently the only built-in atomic-radius dataset.
+
+- **What it is:** radii for isolated neutral atoms defined by the
+ ρ = 0.001 e/bohr³ electron-density isosurface.
+- **Source idea:** a consistent theory-based atomic size measure derived from
+ computed electron densities.
+- **Coverage:** broad, but not complete for the full periodic table.
+- **Why it matters here:** it correlates well with structural vdW radii and is a
+ useful support baseline when a condensed-phase target set is incomplete.
+- **How `atomref` uses it:** support-only dataset for linear transfer into
+ target vdW values such as `alvarez2013`.
+
+This is an important example of the package philosophy: a dataset can be very
+useful algorithmically without being mislabeled as something it is not.
diff --git a/docs/datasets/covalent_radius.md b/docs/datasets/covalent_radius.md
new file mode 100644
index 0000000..5e022fd
--- /dev/null
+++ b/docs/datasets/covalent_radius.md
@@ -0,0 +1,37 @@
+# Covalent radius
+
+The covalent-radius quantity is aimed at bond-detection and related geometry
+workflows. It currently ships one preferred target dataset and one
+legacy support dataset.
+
+## Cordero covalent radii (`cordero2008`)
+
+This is the main covalent-radius target set in the current release line.
+
+- **What it is:** a broad covalent-radius compilation based mainly on
+ crystallographic bond distances.
+- **Why it matters:** it is a modern, widely used reference set for element-wise
+ covalent radii.
+- **Coverage:** broad coverage across the periodic table, but not complete for
+ every element.
+- **How `atomref` uses it:** direct target dataset for covalent-radius lookup.
+
+If you want one covalent set to start with, this is usually the right first
+choice.
+
+## Legacy CSD covalent radii (`csd_legacy_cov`)
+
+This set reflects the older covalent radii historically used in CSD software for
+bond perception.
+
+- **What it is:** a practical, legacy-oriented bond-assignment table.
+- **Why it matters:** it has long been used in chemistry software and contains
+ placeholder conventions that are still relevant for compatibility work.
+- **Coverage:** broad practical coverage, with explicit placeholder values for
+ elements not covered by the historical table.
+- **How `atomref` uses it:** support dataset for substitution when the preferred
+ Cordero target set is missing a value.
+
+Because it contains legacy placeholders, it is not the preferred scientific
+starting point. It is mainly useful as a support layer and for compatibility
+with older workflows.
diff --git a/docs/datasets/index.md b/docs/datasets/index.md
new file mode 100644
index 0000000..d3b2951
--- /dev/null
+++ b/docs/datasets/index.md
@@ -0,0 +1,38 @@
+# Datasets
+
+`atomref` does not treat all datasets as interchangeable lookup tables.
+Instead, the package records several layers of classification:
+
+- **quantity** — the operational property being requested,
+- **domain** — the key space used to index that quantity,
+- **semantic class** — what the dataset scientifically represents,
+- **origin class** — how the values were obtained,
+- **phase context** — what physical context they describe,
+- **usage role** — whether the package treats the dataset as a direct target set
+ or as support data for transfer.
+
+This is what allows a dataset such as **Rahm isodensity atomic radii**
+(`rahm2016`) to be useful in van der Waals workflows without pretending that it
+is itself a condensed-phase structural vdW-radius set.
+
+## Programmatic inspection
+
+The most useful catalog helpers are:
+
+- `atomref.list_quantities()`
+- `atomref.get_quantity_info(...)`
+- `atomref.list_dataset_infos(...)`
+- `atomref.list_radii_set_infos(...)`
+- `atomref.list_xh_set_infos(...)`
+
+If you only need dataset ids, use `list_dataset_ids(...)`, `list_radii_sets(...)`,
+or `list_xh_sets(...)`.
+If you want the packaged values themselves, use `get_builtin_set(...)`,
+`get_radii_set(...)`, or `get_xh_set(...)`.
+
+## Built-in quantity families
+
+- [Covalent radius](covalent_radius.md)
+- [van der Waals radius](van_der_waals_radius.md)
+- [Atomic radius](atomic_radius.md)
+- [X–H bond length](xh_bond_length.md)
diff --git a/docs/datasets/van_der_waals_radius.md b/docs/datasets/van_der_waals_radius.md
new file mode 100644
index 0000000..3013d57
--- /dev/null
+++ b/docs/datasets/van_der_waals_radius.md
@@ -0,0 +1,57 @@
+# van der Waals radius
+
+The van der Waals quantity intentionally includes several target sets with
+different scientific backgrounds. This lets users choose between a classic
+historical compilation, structural contact-derived sets, and compatibility-only
+legacy tables.
+
+## Bondi van der Waals radii (`bondi1964`)
+
+A classic historical reference set compiled from mixed experimental sources.
+
+- **What it is:** the traditional Bondi vdW table used throughout chemistry.
+- **Coverage:** limited, especially for transition metals and heavier elements.
+- **Why you might use it:** historical consistency or comparison with older
+ literature and software defaults.
+
+## Rowland & Taylor nonbonded-contact radii (`rowland_taylor1996`)
+
+A small but influential structural set derived from organic-crystal nonbonded
+contacts.
+
+- **What it is:** a condensed-phase structural vdW set focused on common organic
+ elements.
+- **Coverage:** intentionally narrow.
+- **Why you might use it:** organic-crystal contact analysis and comparisons to
+ classic contact-distance literature.
+
+## Alvarez van der Waals radii (`alvarez2013`)
+
+This is the main van der Waals target set in the current release line.
+
+- **What it is:** a broad structural vdW set derived from statistical analysis
+ of many interatomic distances in the Cambridge Structural Database.
+- **Coverage:** broad, but still incomplete for some elements.
+- **Why you might use it:** it is a strong default for general condensed-phase
+ geometry and contact work.
+- **How `atomref` uses it:** direct target set for vdW lookup, with missing
+ values restored from support data when requested by policy.
+
+## Chernyshov line-of-sight vdW radii (`chernyshov2020`)
+
+A reduced element-wise view of a more atom-type-aware structural analysis.
+
+- **What it is:** vdW radii inferred from line-of-sight contact classification.
+- **Coverage:** focused on elements common in molecular crystals.
+- **Why you might use it:** you want a contact-derived set informed by the LoS
+ idea while still using a simple element-wise API.
+
+## Legacy CSD van der Waals radii (`csd_legacy_vdw`)
+
+A compatibility-oriented table used historically in CSD tools.
+
+- **What it is:** an older practical vdW table with placeholder conventions.
+- **Coverage:** broad practical coverage, but not a modern scientific target
+ set.
+- **How `atomref` uses it:** support-only data for legacy compatibility and
+ future migration work.
diff --git a/docs/datasets/xh_bond_length.md b/docs/datasets/xh_bond_length.md
new file mode 100644
index 0000000..28364c5
--- /dev/null
+++ b/docs/datasets/xh_bond_length.md
@@ -0,0 +1,39 @@
+# X–H bond length
+
+The `xh_bond_length` quantity is a small provisional addition in the current
+release line.
+
+Its purpose is not to claim a complete literature survey of X–H bond lengths.
+Instead, it provides a stable, provenance-aware starting point for
+hydrogen-normalisation workflows and related geometry code.
+
+## Packaged target dataset
+
+### CSD legacy X–H neutron-normalisation targets (`csd_legacy_xh_cno`)
+
+- **What it is:** the fixed `C–H`, `N–H`, and `O–H` target lengths used by
+ ConQuest for terminal-hydrogen normalisation.
+- **Coverage:** only parent elements `C`, `N`, and `O`.
+- **Values:** `C–H = 1.089 Å`, `N–H = 1.015 Å`, `O–H = 0.993 Å`.
+- **Primary provenance:** the ConQuest user guide section *Hydrogen Atom
+ Location in Crystal Structure Analyses*.
+- **Secondary provenance:** Allen & Bruno (2010), which the ConQuest guide cites
+ for these defaults.
+
+## How `atomref` uses it
+
+The built-in `DEFAULT_XH_POLICY` treats `csd_legacy_xh_cno` as a sparse target
+set and restores missing parent elements through a fitted linear transfer from
+`cordero2008` covalent radii.
+
+That means the package draws a sharp line between:
+
+- **curated dataset values** — currently only `C`, `N`, and `O`, and
+- **policy-generated values** — inferred for other parent elements when the
+ predictor policy can supply a covalent radius.
+
+## Scope note
+
+This is intentionally a small addendum rather than full X–H support.
+Broader X–H datasets, richer policies, and more complete literature treatment
+are planned for `0.2.x`.
diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md
new file mode 100644
index 0000000..680b755
--- /dev/null
+++ b/docs/dev/architecture.md
@@ -0,0 +1,109 @@
+# Architecture
+
+Publicly, `atomref` is still radii-first, with a small provisional X–H helper.
+
+Internally, the package is built around four layers:
+
+1. **elements** — stable element metadata and symbol canonicalization,
+2. **registry** — curated quantity and dataset metadata plus packaged data
+ loading,
+3. **policy core** — generic value selection with overrides, transfers,
+ fallbacks, blocked keys, and provenance,
+4. **quantity wrappers** — convenience APIs such as `atomref.radii` and
+ `atomref.xh`.
+
+## Core terminology
+
+A few terms are deliberately separated in the design:
+
+- **quantity** — the operational property family being requested,
+- **domain** — the key space used to index that quantity,
+- **dataset** — one curated source table inside the quantity,
+- **policy** — the ordered rule set used to select a final value.
+
+This separation is what allows the package to say, for example, that
+`rahm2016` belongs to the `atomic_radius` quantity but can still act as support
+data in a van der Waals policy.
+
+## Domain support in the current runtime
+
+The registry schema is domain-aware, but the current resolver intentionally
+implements only one domain:
+
+- `element`
+
+That means:
+
+- packaged built-in sets are currently element-indexed scalar tables,
+- `ValuePolicy` resolves element symbols,
+- transfer fitting is performed over element-wise overlap.
+
+The metadata keeps `domain` explicit now so later versions can extend the data
+model without having to reinterpret existing registry entries.
+
+## Policy resolution and transfer sources
+
+The generic resolver works in a fixed order:
+
+1. blocked keys,
+2. overrides,
+3. base dataset,
+4. transfer models,
+5. fallback,
+6. missing.
+
+Transfer sources can be:
+
+- packaged datasets,
+- custom `ElementScalarSet` objects,
+- generic `ValuePolicy` objects,
+- wrapper policies exposing `as_value_policy()`.
+
+That last point is important. It means higher-level code can express
+"infer values from my chosen covalent-radii policy" instead of being forced to
+refer to one hard-coded predictor dataset.
+
+## Nested-policy safeguards and cycle detection
+
+Policy-backed transfer sources are materialized with more than just raw numeric
+values. The resolver also tracks, per element:
+
+- whether the value came from `base`, `override`, substitution, linear transfer,
+ or fallback,
+- the nested transfer depth that was required to produce it,
+- placeholder status.
+
+`LinearTransfer` uses that information twice:
+
+- once when fitting the linear relation (`fit_sources` / `fit_max_depth`),
+- again when deciding whether the predictor value for the requested element is
+ admissible (`prediction_sources` / `prediction_max_depth`).
+
+The default policy is intentionally conservative: fit only on direct nested
+predictor values, but allow one additional nested completion step when
+predicting the final requested element. This keeps the common two-stage use case
+possible without silently training on arbitrarily long inference chains.
+
+Cycle detection is handled with a context-local activation stack. Both generic
+`ValuePolicy` objects and wrapper policies are tracked, so recursion through a
+freshly materialized wrapper policy is still detected reliably and safely.
+
+## Placeholder handling
+
+Placeholder semantics stay attached to the value that was actually returned.
+This means `LookupResult.is_placeholder` can be true for:
+
+- a base lookup,
+- a substitution transfer,
+- a nested policy used as a transfer source.
+
+A linear transfer normally returns a computed value and therefore does not carry
+placeholder status itself. Instead, its provenance is carried by
+`resolved_from`, explanatory notes, and `transfer_depth`.
+
+## Why the design stays small
+
+The package deliberately avoids a large object graph or a chemistry-specific DSL.
+A quantity wrapper is usually only a thin adapter over the generic policy core.
+That keeps the internals easy to test and lets other scientific packages reuse
+`atomref` without bringing in the rest of the Delone Commons stack.
diff --git a/docs/dev/data_curation.md b/docs/dev/data_curation.md
new file mode 100644
index 0000000..689ae24
--- /dev/null
+++ b/docs/dev/data_curation.md
@@ -0,0 +1,26 @@
+# Data curation
+
+Packaged tables are stored as CSV files indexed by atomic number. Dataset
+metadata and provenance live in `src/atomref/data/registry.json`.
+
+Placeholder values are modeled as dataset metadata, not as hard-coded Python
+constants.
+
+The registry distinguishes several orthogonal concerns:
+
+- `quantity` — the operational lookup target, such as `covalent_radius` or
+ `van_der_waals_radius`
+- `semantic_class` — what the dataset scientifically represents
+- `usage_role` — whether the dataset is intended as a direct target set or as
+ support data for transfer
+- `phase_context` — the physical context of the underlying values
+
+This matters for support-only datasets such as `atomic_radius:rahm2016`, which
+is packaged as atomic support data and then used by the default van der Waals
+policy through linear transfer.
+
+To check that metadata and packaged tables stay synchronized, run:
+
+```bash
+python tools/check_registry.py
+```
diff --git a/docs/dev/dev_plan.md b/docs/dev/dev_plan.md
new file mode 100644
index 0000000..94cdaac
--- /dev/null
+++ b/docs/dev/dev_plan.md
@@ -0,0 +1,33 @@
+# Development plan
+
+## Current status (implemented in the `0.1.x` line)
+
+- stable element metadata
+- curated covalent, van der Waals, and atomic-radius support datasets
+- explicit provenance and coverage metadata
+- generic value-policy core plus radii and X–H convenience wrappers
+- substitution and linear transfer
+- custom element-indexed scalar sets
+- policy-backed transfer sources
+- nested-policy safeguards, transfer-depth tracking, and cycle detection
+- provisional X–H support via `csd_legacy_xh_cno`, `XHPolicy`, and
+ `DEFAULT_XH_POLICY`
+
+## Planned for `0.2.x`
+
+- broader X–H datasets and policies
+- experimental plus computational support sets
+- pairwise helper logic such as reference sums and normalization schemes
+- restoration of incomplete experimental data from broader-support predictors
+
+## Longer-term design ideas
+
+- radial atomic reference functions
+- simple proto-density support based on spherically averaged atomic data
+
+## Possible future directions
+
+- more radii sets
+- uncertainty and confidence flags
+- ion-specific or atom-type-specific domains
+- density-derived radii and related reference transforms
diff --git a/docs/guide/custom_sets.md b/docs/guide/custom_sets.md
new file mode 100644
index 0000000..71306bb
--- /dev/null
+++ b/docs/guide/custom_sets.md
@@ -0,0 +1,31 @@
+# Custom sets
+
+`atomref` is not limited to the packaged tables. You can build a small
+user-defined element-indexed scalar dataset and use it as a base dataset or as a
+support dataset inside a transfer-backed policy.
+
+The simplest entry point is `ElementScalarSet.from_mapping(...)`.
+
+```python
+from atomref import DatasetRef, ElementScalarSet, RadiiPolicy
+
+custom = ElementScalarSet.from_mapping(
+ ref=DatasetRef("covalent_radius", "my_cov"),
+ values={"C": 0.75, "H": 0.31},
+ name="My custom covalent radii",
+ units="angstrom",
+)
+
+policy = RadiiPolicy(kind="covalent", base_set=custom)
+```
+
+This is useful when you want to:
+
+- test an alternative reference table,
+- pin a small project-specific dataset without creating a full package fork,
+- combine a user dataset with built-in support data through substitution or
+ linear transfer.
+
+In the current implementation custom sets are element-domain scalar datasets,
+which keeps the data model small and stable. Later versions may add more
+specialized domains, but custom element-wise sets already cover many workflows.
diff --git a/docs/guide/install.md b/docs/guide/install.md
new file mode 100644
index 0000000..e7e0697
--- /dev/null
+++ b/docs/guide/install.md
@@ -0,0 +1,30 @@
+# Install
+
+For normal use, install the runtime package:
+
+```bash
+pip install atomref
+```
+
+`atomref` is pure Python and has no required runtime dependencies outside the
+standard library.
+
+For local development, documentation work, and tests, install the editable
+package together with the main extras:
+
+```bash
+pip install -e ".[test,docs,dev]"
+```
+
+Those extras currently cover:
+
+- `test` — pytest and test-only compatibility helpers,
+- `docs` — MkDocs and API documentation tooling,
+- `dev` — flake8, build, and release metadata checks.
+
+For a full local pre-release validation pass after installing those extras,
+run:
+
+```bash
+python tools/release_check.py
+```
diff --git a/docs/guide/non_goals.md b/docs/guide/non_goals.md
new file mode 100644
index 0000000..b38aa68
--- /dev/null
+++ b/docs/guide/non_goals.md
@@ -0,0 +1,23 @@
+# Non-goals
+
+`atomref` is intentionally narrow.
+
+It is **not** trying to be:
+
+- a general periodic-table encyclopedia,
+- a home for arbitrary atomic or chemical properties,
+- a structure parser,
+- a crystallographic symmetry package,
+- a structure-inference engine,
+- a Voronoi / tessellation library,
+- an environment-specific chemistry model,
+- a machine-learning framework for extrapolating unseen chemistry.
+
+The package is about **curated reference data and explicit lookup policies**.
+That includes provenance, transfer from broader support datasets, and stable API
+surfaces that higher-level scientific code can rely on.
+
+Future versions may widen the range of supported *reference-data families* — for
+example X–H distances or radial atomic reference functions — but the package
+should still remain a small reference-data layer rather than a full chemistry
+platform.
diff --git a/docs/guide/notebooks.md b/docs/guide/notebooks.md
new file mode 100644
index 0000000..2ad0045
--- /dev/null
+++ b/docs/guide/notebooks.md
@@ -0,0 +1,25 @@
+# Notebook gallery
+
+`atomref` ships example Jupyter notebooks that cover the main workflows.
+Each notebook is available in two forms:
+
+- the original `.ipynb` file in the repository,
+- a rendered Markdown copy included in these docs.
+
+That way users can either run the notebooks locally or read them directly on the
+documentation site.
+
+## Available notebooks
+
+- [Quickstart notebook](../notebooks/01-quickstart.md) — basic imports,
+ `get_*` vs `lookup_*`, quantity discovery, and packaged-set access.
+- [Policies and assessment notebook](../notebooks/02-policies-and-assessment.md)
+ — overrides, transfer-backed policies, and policy summaries.
+- [Custom sets and discovery notebook](../notebooks/03-custom-sets-and-discovery.md)
+ — user-defined sets, catalog inspection, and metadata exploration.
+
+The original notebook files are also in the repository:
+
+- [`01-quickstart.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb)
+- [`02-policies-and-assessment.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb)
+- [`03-custom-sets-and-discovery.ipynb`](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb)
diff --git a/docs/guide/policies.md b/docs/guide/policies.md
new file mode 100644
index 0000000..b9e3b7a
--- /dev/null
+++ b/docs/guide/policies.md
@@ -0,0 +1,222 @@
+# Policies
+
+A policy tells `atomref` how to answer the question “what value should I use for
+this element?”
+
+That may sound simple, but in practice scientific datasets are often
+incomplete. A policy makes the decision process explicit instead of hiding it in
+algorithm code.
+
+## Terms used in the policy layer
+
+A few terms appear repeatedly in the API and docs:
+
+- **quantity** — the operational property family being requested.
+- **domain** — the lookup key space. In the current runtime that means
+ `element`, so lookups are keyed by element symbol.
+- **dataset** — a curated named table inside one quantity.
+- **policy** — the ordered rule set used to resolve missing values.
+
+The quantity and dataset live in the curated registry. The policy is the
+selection logic that sits on top of them.
+
+## Resolution order
+
+In the current implementation every lookup follows the same ordered path:
+
+1. **Blocked key** (optional)
+2. **Override**
+3. **Base dataset**
+4. **Transfer models**, in the order you listed them
+5. **Fallback**
+6. **Missing**
+
+Each step has a specific meaning.
+
+### Blocked key
+
+Some quantity wrappers need to declare that certain domain keys should never be
+resolved, even if a transfer model could otherwise invent a number. The current
+X–H helper uses this for `H`, because `xh_bond_length` is keyed by the parent
+atom `X` in `X–H`, not by hydrogen itself.
+
+### Override
+
+An override is a value you provide directly for a specific element. It wins over
+everything else and is useful when you want to pin one or two elements without
+changing the whole dataset.
+
+### Base dataset
+
+The base dataset is the preferred source. For example, the default covalent
+policy starts from the **Cordero covalent radii** (`cordero2008`), and the
+default vdW policy starts from the **Alvarez van der Waals radii**
+(`alvarez2013`).
+
+### Transfer
+
+A transfer model is used only when the base dataset has no value for the
+requested element.
+
+Built-in transfer models are:
+
+- `SubstitutionTransfer` — take a value directly from another dataset or policy,
+- `LinearTransfer` — infer a target-equivalent value from another dataset or
+ policy through a fitted linear model.
+
+`LinearTransfer` already accepts a tuple of predictors in the API, but the
+current runtime intentionally supports exactly one predictor source. That keeps
+the implementation simple now while leaving room for later multi-predictor
+linear models.
+
+Transfer sources can be:
+
+- a packaged dataset reference (`DatasetRef`),
+- a custom `ElementScalarSet`,
+- a generic `ValuePolicy`,
+- a wrapper policy such as `RadiiPolicy` or `XHPolicy`.
+
+When a transfer source is itself a policy, `atomref` uses the values selected by
+that policy. This lets higher-level workflows express things like “infer X–H
+lengths from my chosen covalent-radii policy” instead of hard-coding a specific
+support dataset.
+
+#### Nested policy safeguards for `LinearTransfer`
+
+When a predictor source is itself a policy, two different questions matter:
+
+1. Which nested predictor values are trustworthy enough to train the linear fit?
+2. Which nested predictor value is acceptable for the final requested element?
+
+`atomref` keeps those two decisions separate. By default:
+
+- `fit_sources=("base", "override")` and `fit_max_depth=0`,
+- `prediction_sources=("base", "override", "transfer_substitution", "transfer_linear")`
+ and `prediction_max_depth=1`.
+
+That means the fitted relationship is trained only on direct predictor values by
+default, while one additional nested completion step is still allowed at
+prediction time.
+
+This is a good default for workflows such as:
+
+- sparse target X–H data from `csd_legacy_xh_cno`,
+- a partial covalent-radii predictor policy with direct `s,p` values,
+- one inner transfer from a broader support set such as `cordero2008` to make
+ the predictor usable for `d` or `f` elements.
+
+In that setup, the outer X–H fit still uses direct predictor anchors, while the
+final requested element may use one nested predictor transfer. If you really do
+want fit training to use nested predictor values as well, you can opt in
+explicitly by widening `fit_sources` and/or increasing `fit_max_depth`.
+
+### Fallback
+
+A fallback is a constant last-resort value. It is useful when an algorithm must
+receive *some* number even if both the base dataset and transfer sources are
+missing a value.
+
+### Missing
+
+If nothing above can produce a value and no fallback was configured, the result
+is simply missing. In that case `get_*` returns `None`, while `lookup_*`
+returns a `LookupResult` with `source="missing"` and explanatory notes.
+
+## Placeholder values and `is_placeholder`
+
+Some support datasets use placeholder numbers to stand in for “unknown but keep
+this legacy table dense enough for downstream heuristics”.
+
+`LookupResult.is_placeholder` answers one narrow question:
+
+> Is the **returned numeric value itself** marked as a placeholder by the source
+> that supplied it?
+
+It does **not** mean “a transfer happened”. Examples:
+
+- a base lookup can have `is_placeholder=True` if the base dataset contains a
+ placeholder value,
+- a substitution transfer can also have `is_placeholder=True` if it copied a
+ placeholder from the transfer source,
+- a linear transfer is computed, not copied, so `is_placeholder` is normally
+ `False`.
+
+## Transfer depth and cycle detection
+
+`LookupResult.transfer_depth` counts how many transfer steps were needed to
+produce the returned value:
+
+- direct base and override values have depth `0`,
+- one substitution or linear restoration has depth `1`,
+- nested transfer chains increase the depth further.
+
+This makes nested-policy behavior inspectable without trying to infer it from
+notes alone.
+
+Because policies may now depend on other policies, the resolver also performs
+cycle detection. A cyclic reference such as policy A depending on policy B while
+policy B depends back on policy A raises `PolicyError` instead of recursing
+indefinitely. The same protection applies when recursion goes through wrapper
+policies such as `RadiiPolicy` or `XHPolicy`.
+
+## Target datasets and support datasets
+
+`atomref` separates **what a dataset is used for** from **what it scientifically
+represents**.
+
+That is why the package stores:
+
+- the operational **quantity**,
+- the lookup **domain**,
+- the scientific **semantic class**,
+- the package-level **usage role**.
+
+This distinction matters for datasets such as **Rahm isodensity atomic radii**
+(`rahm2016`). They are useful support data for restoring missing van der Waals
+radii, but they are not the same thing as a condensed-phase structural vdW
+radius set. In `atomref`, that difference is recorded in the metadata instead of
+being hidden.
+
+## Examples
+
+A standard dataset-backed transfer:
+
+```python
+import atomref as ar
+
+policy = ar.RadiiPolicy(
+ kind="van_der_waals",
+ base_set="alvarez2013",
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef("atomic_radius", "rahm2016"),),
+ ),
+ ),
+ overrides={"Xe": 2.10},
+)
+```
+
+A policy-backed transfer source:
+
+```python
+import atomref as ar
+
+xh_policy = ar.XHPolicy(
+ base_set="csd_legacy_xh_cno",
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DEFAULT_COVALENT_POLICY,),
+ min_points=3,
+ ),
+ ),
+)
+```
+
+With that X–H policy:
+
+- `C`, `N`, and `O` use the curated ConQuest defaults,
+- missing parent elements may be inferred from the **selected covalent-radii
+ policy**, not just from one hard-coded support dataset,
+- if the predictor policy itself needed a transfer to produce a covalent radius,
+ the resulting `LookupResult` still records that provenance in `resolved_from`,
+ `notes`, and `transfer_depth`.
diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md
new file mode 100644
index 0000000..72e6858
--- /dev/null
+++ b/docs/guide/quickstart.md
@@ -0,0 +1,61 @@
+# Quickstart
+
+The two most important user-facing ideas in `atomref` are:
+
+- `get_*` returns only the selected number,
+- `lookup_*` returns the number **and** provenance metadata.
+
+```pycon
+>>> import atomref as ar
+>>> ar.get_covalent_radius("C")
+0.76
+>>> ar.get_vdw_radius("O")
+1.5
+>>> ar.get_xh_bond_length("N")
+1.015
+>>> lookup = ar.lookup_vdw_radius("Pm")
+>>> lookup.value
+2.8972265395148358
+>>> lookup.source
+'transfer_linear'
+>>> lookup.resolved_from
+(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),)
+```
+
+Use `get_*` when you only need the value. Use `lookup_*` when you want to know
+whether the result came from the preferred dataset, a support dataset, a policy
+override, or a fallback.
+
+You can inspect the packaged quantity layer directly:
+
+```pycon
+>>> import atomref as ar
+>>> ar.list_quantities()
+('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length')
+>>> ar.get_quantity_info("xh_bond_length")
+QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.')
+>>> [info.ref.set_id for info in ar.list_radii_set_infos("van_der_waals", usage_role="target")]
+['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020']
+```
+
+And you can load a packaged set object directly:
+
+```pycon
+>>> import atomref as ar
+>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013")
+>>> vdw.get("O")
+1.5
+>>> raw = ar.get_builtin_set(ar.DatasetRef("atomic_radius", "rahm2016"))
+>>> raw.get("Pm")
+2.83
+>>> xh = ar.get_xh_set("csd_legacy_xh_cno")
+>>> xh.get("C")
+1.089
+```
+
+For longer, runnable examples see:
+
+- the [notebook overview](notebooks.md),
+- the [quickstart notebook page](../notebooks/01-quickstart.md),
+- the [policies notebook page](../notebooks/02-policies-and-assessment.md),
+- the [custom sets notebook page](../notebooks/03-custom-sets-and-discovery.md).
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..198fa6a
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,161 @@
+# atomref
+
+[![CI](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/ci.yml)
+[![Docs](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml/badge.svg)](https://github.com/DeloneCommons/atomref/actions/workflows/docs.yml)
+[![PyPI](https://img.shields.io/pypi/v/atomref.svg)](https://pypi.org/project/atomref/)
+[![Python versions](https://img.shields.io/pypi/pyversions/atomref.svg)](https://pypi.org/project/atomref/)
+[![License](https://img.shields.io/github/license/DeloneCommons/atomref.svg)](https://github.com/DeloneCommons/atomref/blob/main/LICENSE)
+
+`atomref` is a small pure-Python package for **curated atomic reference data**
+and **provenance-aware lookup policies** used by geometry and
+structure-analysis algorithms.
+
+It is not meant to be yet another periodic-table encyclopedia. The package is
+for code that needs stable atomic reference values with explicit provenance,
+clear fallback behavior, and honest handling of incomplete preferred datasets.
+
+What you get in the current release line:
+
+- stable element metadata,
+- curated named radii sets,
+- provisional X–H bond-length support for hydrogen-normalisation workflows,
+- dataset provenance and coverage metadata,
+- deterministic lookup policies,
+- substitution and linear transfer from support datasets or policies into target datasets,
+- guarded nested policy-backed transfers with explicit transfer depth,
+ conservative fit/prediction controls, and cycle detection,
+- user-defined custom element-indexed scalar sets.
+
+## Core terms
+
+`atomref` uses a small vocabulary on purpose.
+
+- **quantity** — the operational property family being requested, such as
+ `covalent_radius`, `van_der_waals_radius`, `atomic_radius`, or
+ `xh_bond_length`.
+- **domain** — the key space used to index that quantity. In the current
+ runtime, the supported domain is `element`, meaning lookups are keyed by an
+ element symbol.
+- **dataset** — one curated named table inside a quantity, such as
+ `cordero2008`, `alvarez2013`, or `csd_legacy_xh_cno`.
+- **policy** — the ordered rule set that decides what value to return when the
+ preferred dataset is incomplete.
+
+The metadata layer already records `domain` explicitly because the package is
+built for later extension, but the current runtime intentionally keeps the
+implementation narrow and stable: **the current runtime resolves only
+element-domain scalar values**.
+
+## Why this exists
+
+Scientific software often wants a complete lookup table, but the best dataset
+for the job is rarely complete. `atomref` makes that situation explicit.
+Instead of hiding ad hoc defaults inside algorithm code, you choose a target
+set, describe how missing values may be restored, and keep provenance on what
+was actually returned.
+
+The built-in default behavior is intentionally simple and practical:
+
+- **Cordero covalent radii** (`cordero2008`) are the preferred covalent target
+ set, with missing values substituted from the **legacy CSD covalent radii**
+ (`csd_legacy_cov`).
+- **Alvarez van der Waals radii** (`alvarez2013`) are the preferred vdW target
+ set, with missing values restored from the **Rahm isodensity atomic radii**
+ (`rahm2016`) through a fitted linear transfer.
+- **CSD/ConQuest hydrogen-normalisation defaults** (`csd_legacy_xh_cno`) are a
+ provisional sparse X–H target set for `C`, `N`, and `O`, with other parent
+ elements inferred from **Cordero covalent radii** through a fitted linear
+ transfer.
+
+Nested policy predictors are supported too. `LinearTransfer` separates
+**fit-time** use of nested predictor values from **prediction-time** use. By
+default, the fit may use only direct nested values, while the final requested
+element may still use one additional nested completion step. That is a useful
+compromise for provisional X–H inference from a chosen covalent-radii policy.
+
+## Quick example
+
+```pycon
+>>> import atomref as ar
+>>> ar.get_covalent_radius("C")
+0.76
+>>> ar.get_vdw_radius("O")
+1.5
+>>> ar.get_xh_bond_length("N")
+1.015
+>>> lookup = ar.lookup_vdw_radius("Pm")
+>>> lookup.value
+2.8972265395148358
+>>> lookup.source
+'transfer_linear'
+>>> lookup.transfer_depth
+1
+>>> lookup.resolved_from
+(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),)
+```
+
+`get_*` returns only the number. `lookup_*` returns a `LookupResult` that also
+records where the value came from, whether a transfer model or policy source was
+involved, and how many transfer steps were needed (`transfer_depth`).
+
+You can inspect the packaged quantity and dataset catalog directly:
+
+```pycon
+>>> import atomref as ar
+>>> ar.list_quantities()
+('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length')
+>>> ar.get_quantity_info("xh_bond_length")
+QuantityInfo(quantity='xh_bond_length', domain='element', units='angstrom', description='Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows.')
+>>> [info.ref.set_id for info in ar.list_dataset_infos("van_der_waals_radius", usage_role="target")]
+['bondi1964', 'rowland_taylor1996', 'alvarez2013', 'chernyshov2020']
+```
+
+You can also load a packaged set directly:
+
+```pycon
+>>> import atomref as ar
+>>> vdw = ar.get_radii_set("van_der_waals", "alvarez2013")
+>>> vdw.get("O")
+1.5
+>>> xh = ar.get_xh_set("csd_legacy_xh_cno")
+>>> xh.get("C")
+1.089
+```
+
+## Notebook walkthroughs
+
+The repository ships example notebooks for the main workflows. In the
+documentation they are also available as rendered Markdown pages, so users can
+read them without opening Jupyter first.
+
+- [Notebook overview](https://delonecommons.github.io/atomref/guide/notebooks/)
+- [Quickstart notebook](https://delonecommons.github.io/atomref/notebooks/01-quickstart/)
+- [Policies and assessment notebook](https://delonecommons.github.io/atomref/notebooks/02-policies-and-assessment/)
+- [Custom sets and discovery notebook](https://delonecommons.github.io/atomref/notebooks/03-custom-sets-and-discovery/)
+
+## Relationship to Delone Commons
+
+`atomref` is designed as a standalone package, but within Delone Commons it is
+primarily intended to support chemistry-aware packages such as:
+
+- `molcryst`, for covalent-bond detection, contact analysis, and hydrogen workflows,
+- future `chemvoro`, for chemistry-aware contact and hydrogen workflows.
+
+By contrast, `pyvoro2` and `pbcgraph` are intentionally general mathematical
+packages and are not direct consumers of `atomref`.
+
+## Data curation and developer tools
+
+The repository also ships small maintenance tools. The most important ones are:
+
+- `python tools/check_registry.py` — validate curated registry metadata against
+ packaged CSV tables,
+- `python tools/check_notebooks.py` — execute notebook code cells,
+- `python tools/export_notebooks.py` — turn notebooks into Markdown pages for
+ the docs,
+- `python tools/gen_readme.py` — regenerate `README.md` from this page,
+- `python tools/release_check.py` — run the full release-preparation checklist,
+ including linting, tests, docs, builds, and artifact validation.
+
+See the [tools README](https://github.com/DeloneCommons/atomref/blob/main/tools/README.md)
+for a short description of each script.
diff --git a/docs/notebooks/01-quickstart.md b/docs/notebooks/01-quickstart.md
new file mode 100644
index 0000000..12e8813
--- /dev/null
+++ b/docs/notebooks/01-quickstart.md
@@ -0,0 +1,72 @@
+
+
+[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/01-quickstart.ipynb)
+# atomref quickstart
+
+This notebook covers the main public API: element helpers, direct
+`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset
+discovery.
+```python
+import atomref as ar
+
+print(ar.get_element('Cl'))
+print(ar.list_quantities())
+```
+**Output**
+```text
+Element(z=17, symbol='Cl', name='Chlorine')
+('covalent_radius', 'van_der_waals_radius', 'atomic_radius', 'xh_bond_length')
+```
+```python
+r_c = ar.get_covalent_radius('C')
+r_vdw = ar.get_vdw_radius('O')
+print(r_c)
+print(r_vdw)
+assert r_c == 0.76
+assert r_vdw == 1.50
+```
+**Output**
+```text
+0.76
+1.5
+```
+```python
+lookup = ar.lookup_vdw_radius('Pm')
+print(f"{lookup.value:.12f}")
+print(lookup.source)
+print(lookup.resolved_from)
+assert lookup.source == 'transfer_linear'
+```
+**Output**
+```text
+2.897226539515
+transfer_linear
+(DatasetRef(quantity='atomic_radius', set_id='rahm2016'),)
+```
+```python
+quantity = ar.get_quantity_info('atomic_radius')
+print(quantity.quantity, quantity.domain, quantity.units)
+
+for info in ar.list_dataset_infos('van_der_waals_radius', usage_role='target'):
+ print(info.ref.set_id, info.name, info.usage_role)
+```
+**Output**
+```text
+atomic_radius element angstrom
+bondi1964 Bondi van der Waals radii target
+rowland_taylor1996 Rowland & Taylor nonbonded contact radii target
+alvarez2013 Alvarez van der Waals radii target
+chernyshov2020 Chernyshov LoS van der Waals radii target
+```
+```python
+vdw = ar.get_radii_set('van_der_waals', 'alvarez2013')
+print(vdw.get('O'))
+
+support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016'))
+print(support.get('Pm'))
+```
+**Output**
+```text
+1.5
+2.83
+```
diff --git a/docs/notebooks/02-policies-and-assessment.md b/docs/notebooks/02-policies-and-assessment.md
new file mode 100644
index 0000000..4f6baf6
--- /dev/null
+++ b/docs/notebooks/02-policies-and-assessment.md
@@ -0,0 +1,73 @@
+
+
+[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/02-policies-and-assessment.ipynb)
+# Policies and assessment
+
+This notebook shows how `atomref` resolves missing values through ordered
+policy steps and how to inspect policy-level behavior.
+```python
+import atomref as ar
+```
+```python
+covalent_policy = ar.RadiiPolicy(
+ kind='covalent',
+ base_set='cordero2008',
+ transfers=(
+ ar.SubstitutionTransfer(
+ source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov')
+ ),
+ ),
+)
+lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy)
+print(lookup.source)
+print(f"{lookup.value:.12f}")
+print(lookup.resolved_from)
+```
+**Output**
+```text
+transfer_substitution
+1.540000000000
+(DatasetRef(quantity='covalent_radius', set_id='csd_legacy_cov'),)
+```
+```python
+vdw_policy = ar.RadiiPolicy(
+ kind='van_der_waals',
+ base_set='alvarez2013',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),)
+ ),
+ ),
+)
+lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy)
+print(f"{lookup.value:.12f}")
+print(lookup.source)
+print(
+ f"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}"
+)
+```
+**Output**
+```text
+2.897226539515
+transfer_linear
+slope=1.135336645553 intercept=-0.315776167399 n=90
+```
+```python
+assessment = ar.assess_radii_policy(
+ ['C', 'Xe', 'Pm', 'Bk'],
+ policy=vdw_policy,
+ detail=True,
+)
+print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing)
+for row in assessment.per_element:
+ value = 'None' if row.lookup.value is None else f"{row.lookup.value:.12f}"
+ print(row.symbol, row.lookup.source, value)
+```
+**Output**
+```text
+3 1 0
+C base 1.770000000000
+Xe base 2.060000000000
+Pm transfer_linear 2.897226539515
+Bk base 3.400000000000
+```
diff --git a/docs/notebooks/03-custom-sets-and-discovery.md b/docs/notebooks/03-custom-sets-and-discovery.md
new file mode 100644
index 0000000..47138bf
--- /dev/null
+++ b/docs/notebooks/03-custom-sets-and-discovery.md
@@ -0,0 +1,56 @@
+
+
+[Open the original notebook on GitHub](https://github.com/DeloneCommons/atomref/blob/main/notebooks/03-custom-sets-and-discovery.ipynb)
+# Custom sets and dataset discovery
+
+This notebook shows how to define a small user-provided set, plug it into a
+policy, and inspect the packaged dataset catalog.
+```python
+import atomref as ar
+```
+```python
+custom_cov = ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef("covalent_radius", "demo_user_cov"),
+ values={"C": 0.77, "O": 0.67},
+ name="Demo user covalent set",
+ units="angstrom",
+ description="Example custom set for notebook usage.",
+ notes=("Notebook example",),
+)
+
+policy = ar.RadiiPolicy(
+ kind="covalent",
+ base_set=custom_cov,
+ transfers=(
+ ar.SubstitutionTransfer(
+ source=ar.DatasetRef("covalent_radius", "cordero2008")
+ ),
+ ),
+)
+
+for symbol in ("C", "O", "N"):
+ print(symbol, ar.lookup_covalent_radius(symbol, policy=policy))
+```
+**Output**
+```text
+C LookupResult(value=0.77, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0)
+O LookupResult(value=0.67, source='base', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'),), is_placeholder=False, fit=None, notes=(), transfer_depth=0)
+N LookupResult(value=0.71, source='transfer_substitution', target=DatasetRef(quantity='covalent_radius', set_id='demo_user_cov'), resolved_from=(DatasetRef(quantity='covalent_radius', set_id='cordero2008'),), is_placeholder=False, fit=None, notes=('missing in base set; substituted from transfer source',), transfer_depth=1)
+```
+```python
+for info in ar.list_radii_set_infos("van_der_waals", usage_role="target"):
+ print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context)
+
+rahm = ar.get_dataset_info(ar.DatasetRef("atomic_radius", "rahm2016"))
+print(rahm.name)
+print(rahm.semantic_class, rahm.phase_context, rahm.usage_role)
+```
+**Output**
+```text
+bondi1964 vdw_compiled compiled_experimental mixed_or_legacy
+rowland_taylor1996 vdw_structural structural condensed_phase
+alvarez2013 vdw_structural structural condensed_phase
+chernyshov2020 vdw_structural_typed_reduced structural condensed_phase
+Rahm isodensity atomic radii (ρ=0.001 e/bohr³)
+atomic_isodensity isolated_atom support
+```
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..e0952f2
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,50 @@
+site_name: atomref
+site_url: https://delonecommons.github.io/atomref/
+repo_url: https://github.com/DeloneCommons/atomref
+repo_name: DeloneCommons/atomref
+
+theme:
+ name: material
+
+plugins:
+ - search
+ - mkdocstrings:
+ handlers:
+ python:
+ paths: [src]
+ options:
+ show_root_heading: true
+ show_source: false
+
+nav:
+ - Home: index.md
+ - Guide:
+ - Install: guide/install.md
+ - Quickstart: guide/quickstart.md
+ - Policies: guide/policies.md
+ - Custom sets: guide/custom_sets.md
+ - Non-goals: guide/non_goals.md
+ - Datasets:
+ - Overview: datasets/index.md
+ - Covalent radius: datasets/covalent_radius.md
+ - van der Waals radius: datasets/van_der_waals_radius.md
+ - Atomic radius: datasets/atomic_radius.md
+ - X–H bond length: datasets/xh_bond_length.md
+ - Notebooks:
+ - Overview: guide/notebooks.md
+ - Quickstart notebook: notebooks/01-quickstart.md
+ - Policies and assessment notebook: notebooks/02-policies-and-assessment.md
+ - Custom sets and discovery notebook: notebooks/03-custom-sets-and-discovery.md
+ - Development:
+ - Architecture: dev/architecture.md
+ - Data curation: dev/data_curation.md
+ - Development plan: dev/dev_plan.md
+ - API:
+ - Overview: api/index.md
+ - atomref: api/atomref.md
+ - atomref.elements: api/elements.md
+ - atomref.registry: api/registry.md
+ - atomref.transfer: api/transfer.md
+ - atomref.policy: api/policy.md
+ - atomref.radii: api/radii.md
+ - atomref.xh: api/xh.md
diff --git a/notebooks/01-quickstart.ipynb b/notebooks/01-quickstart.ipynb
new file mode 100644
index 0000000..47b58d1
--- /dev/null
+++ b/notebooks/01-quickstart.ipynb
@@ -0,0 +1,89 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# atomref quickstart\n\nThis notebook covers the main public API: element helpers, direct\n`get_*` calls, provenance-carrying `lookup_*` calls, and packaged dataset\ndiscovery.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import atomref as ar\n",
+ "\n",
+ "print(ar.get_element('Cl'))\n",
+ "print(ar.list_quantities())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r_c = ar.get_covalent_radius('C')\n",
+ "r_vdw = ar.get_vdw_radius('O')\n",
+ "print(r_c)\n",
+ "print(r_vdw)\n",
+ "assert r_c == 0.76\n",
+ "assert r_vdw == 1.50\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lookup = ar.lookup_vdw_radius('Pm')\n",
+ "print(f\"{lookup.value:.12f}\")\n",
+ "print(lookup.source)\n",
+ "print(lookup.resolved_from)\n",
+ "assert lookup.source == 'transfer_linear'\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "quantity = ar.get_quantity_info('atomic_radius')\n",
+ "print(quantity.quantity, quantity.domain, quantity.units)\n",
+ "\n",
+ "for info in ar.list_dataset_infos('van_der_waals_radius', usage_role='target'):\n",
+ " print(info.ref.set_id, info.name, info.usage_role)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vdw = ar.get_radii_set('van_der_waals', 'alvarez2013')\n",
+ "print(vdw.get('O'))\n",
+ "\n",
+ "support = ar.get_builtin_set(ar.DatasetRef('atomic_radius', 'rahm2016'))\n",
+ "print(support.get('Pm'))\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/02-policies-and-assessment.ipynb b/notebooks/02-policies-and-assessment.ipynb
new file mode 100644
index 0000000..dfe2678
--- /dev/null
+++ b/notebooks/02-policies-and-assessment.ipynb
@@ -0,0 +1,97 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Policies and assessment\n",
+ "\n",
+ "This notebook shows how `atomref` resolves missing values through ordered\n",
+ "policy steps and how to inspect policy-level behavior.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import atomref as ar\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "covalent_policy = ar.RadiiPolicy(\n",
+ " kind='covalent',\n",
+ " base_set='cordero2008',\n",
+ " transfers=(\n",
+ " ar.SubstitutionTransfer(\n",
+ " source=ar.DatasetRef('covalent_radius', 'csd_legacy_cov')\n",
+ " ),\n",
+ " ),\n",
+ ")\n",
+ "lookup = ar.lookup_covalent_radius('Bk', policy=covalent_policy)\n",
+ "print(lookup.source)\n",
+ "print(f\"{lookup.value:.12f}\")\n",
+ "print(lookup.resolved_from)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vdw_policy = ar.RadiiPolicy(\n",
+ " kind='van_der_waals',\n",
+ " base_set='alvarez2013',\n",
+ " transfers=(\n",
+ " ar.LinearTransfer(\n",
+ " predictors=(ar.DatasetRef('atomic_radius', 'rahm2016'),)\n",
+ " ),\n",
+ " ),\n",
+ ")\n",
+ "lookup = ar.lookup_vdw_radius('Pm', policy=vdw_policy)\n",
+ "print(f\"{lookup.value:.12f}\")\n",
+ "print(lookup.source)\n",
+ "print(\n",
+ " f\"slope={lookup.fit.coefficients[0]:.12f} intercept={lookup.fit.intercept:.12f} n={lookup.fit.n_points}\"\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assessment = ar.assess_radii_policy(\n",
+ " ['C', 'Xe', 'Pm', 'Bk'],\n",
+ " policy=vdw_policy,\n",
+ " detail=True,\n",
+ ")\n",
+ "print(assessment.n_base, assessment.n_transfer_linear, assessment.n_missing)\n",
+ "for row in assessment.per_element:\n",
+ " value = 'None' if row.lookup.value is None else f\"{row.lookup.value:.12f}\"\n",
+ " print(row.symbol, row.lookup.source, value)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/03-custom-sets-and-discovery.ipynb b/notebooks/03-custom-sets-and-discovery.ipynb
new file mode 100644
index 0000000..58f9d92
--- /dev/null
+++ b/notebooks/03-custom-sets-and-discovery.ipynb
@@ -0,0 +1,79 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Custom sets and dataset discovery\n",
+ "\n",
+ "This notebook shows how to define a small user-provided set, plug it into a\n",
+ "policy, and inspect the packaged dataset catalog.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import atomref as ar\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "custom_cov = ar.ElementScalarSet.from_mapping(\n",
+ " ref=ar.DatasetRef(\"covalent_radius\", \"demo_user_cov\"),\n",
+ " values={\"C\": 0.77, \"O\": 0.67},\n",
+ " name=\"Demo user covalent set\",\n",
+ " units=\"angstrom\",\n",
+ " description=\"Example custom set for notebook usage.\",\n",
+ " notes=(\"Notebook example\",),\n",
+ ")\n",
+ "\n",
+ "policy = ar.RadiiPolicy(\n",
+ " kind=\"covalent\",\n",
+ " base_set=custom_cov,\n",
+ " transfers=(\n",
+ " ar.SubstitutionTransfer(\n",
+ " source=ar.DatasetRef(\"covalent_radius\", \"cordero2008\")\n",
+ " ),\n",
+ " ),\n",
+ ")\n",
+ "\n",
+ "for symbol in (\"C\", \"O\", \"N\"):\n",
+ " print(symbol, ar.lookup_covalent_radius(symbol, policy=policy))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for info in ar.list_radii_set_infos(\"van_der_waals\", usage_role=\"target\"):\n",
+ " print(info.ref.set_id, info.semantic_class, info.origin_class, info.phase_context)\n",
+ "\n",
+ "rahm = ar.get_dataset_info(ar.DatasetRef(\"atomic_radius\", \"rahm2016\"))\n",
+ "print(rahm.name)\n",
+ "print(rahm.semantic_class, rahm.phase_context, rahm.usage_role)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b712101
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,97 @@
+[build-system]
+requires = ["hatchling>=1.24"]
+build-backend = "hatchling.build"
+
+[project]
+name = "atomref"
+dynamic = ["version"]
+description = "Curated atomic reference data and transfer policies for geometry and structure-analysis algorithms."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+authors = [
+ { name = "Ivan Yu. Chernyshov", email = "ivan.chernyshoff@gmail.com" }
+]
+keywords = ["chemistry", "materials", "crystallography", "reference data", "atomic radii"]
+classifiers = [
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: Science/Research",
+ "Topic :: Scientific/Engineering :: Chemistry",
+ "Topic :: Software Development :: Libraries",
+ "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Operating System :: OS Independent",
+ "Typing :: Typed",
+]
+dependencies = []
+
+[project.urls]
+Homepage = "https://delonecommons.github.io/atomref/"
+Documentation = "https://delonecommons.github.io/atomref/"
+Repository = "https://github.com/DeloneCommons/atomref"
+Issues = "https://github.com/DeloneCommons/atomref/issues"
+Changelog = "https://github.com/DeloneCommons/atomref/blob/main/CHANGELOG.md"
+
+[project.optional-dependencies]
+test = [
+ "pytest>=7",
+ "tomli>=2; python_version < '3.11'",
+]
+docs = [
+ "mkdocs>=1.6,<2",
+ "mkdocs-material>=9.5",
+ "mkdocstrings[python]>=0.25",
+ "mkdocs-include-markdown-plugin>=6.2",
+ "pymdown-extensions>=10.0",
+ "tomli>=2; python_version < '3.11'",
+]
+dev = [
+ "build>=1.2",
+ "twine>=5",
+ "flake8>=7",
+]
+
+[tool.hatch.version]
+path = "src/atomref/__about__.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/atomref"]
+include = [
+ "src/atomref/data/*.csv",
+ "src/atomref/data/*.json",
+]
+
+[tool.hatch.build.targets.sdist]
+include = [
+ "/src",
+ "/tests",
+ "/docs",
+ "/tools",
+ "/notebooks",
+ "/mkdocs.yml",
+ "/README.md",
+ "/CHANGELOG.md",
+ "/DEV_PLAN.md",
+ "/NOTICE.md",
+ "/LICENSE",
+ "/COPYING",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+addopts = "-ra --ignore=build --ignore=dist"
+norecursedirs = [
+ ".git",
+ ".pytest_cache",
+ "__pycache__",
+ ".venv",
+ ".tox",
+ "dist",
+ ".eggs",
+ "*.egg-info",
+]
diff --git a/src/atomref/__about__.py b/src/atomref/__about__.py
new file mode 100644
index 0000000..bbab024
--- /dev/null
+++ b/src/atomref/__about__.py
@@ -0,0 +1 @@
+__version__ = "0.1.4"
diff --git a/src/atomref/__init__.py b/src/atomref/__init__.py
new file mode 100644
index 0000000..fb569b3
--- /dev/null
+++ b/src/atomref/__init__.py
@@ -0,0 +1,102 @@
+"""Public package exports for :mod:`atomref`."""
+
+from .__about__ import __version__
+from .elements import (
+ Element,
+ canonicalize_element_symbol,
+ get_element,
+ iter_elements,
+ is_valid_element_symbol,
+)
+from .policy import LookupResult, ValuePolicy, get_value, lookup_value
+from .radii import (
+ DEFAULT_COVALENT_POLICY,
+ DEFAULT_VDW_POLICY,
+ RadiiElementAssessment,
+ RadiiPolicy,
+ RadiiPolicyAssessment,
+ assess_radii_policy,
+ get_covalent_radius,
+ get_radii_set,
+ get_radii_set_info,
+ get_vdw_radius,
+ list_radii_set_infos,
+ list_radii_sets,
+ lookup_covalent_radius,
+ lookup_vdw_radius,
+)
+from .xh import (
+ DEFAULT_XH_POLICY,
+ XHPolicy,
+ get_xh_bond_length,
+ get_xh_set,
+ get_xh_set_info,
+ list_xh_set_infos,
+ list_xh_sets,
+ lookup_xh_bond_length,
+)
+from .registry import (
+ CoverageInfo,
+ DatasetInfo,
+ DatasetRef,
+ ElementScalarSet,
+ QuantityInfo,
+ Reference,
+ get_builtin_set,
+ get_dataset_info,
+ get_quantity_info,
+ list_dataset_ids,
+ list_dataset_infos,
+ list_quantities,
+)
+from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer
+
+__all__ = [
+ "__version__",
+ "Element",
+ "canonicalize_element_symbol",
+ "get_element",
+ "iter_elements",
+ "is_valid_element_symbol",
+ "CoverageInfo",
+ "DatasetInfo",
+ "DatasetRef",
+ "ElementScalarSet",
+ "QuantityInfo",
+ "Reference",
+ "get_builtin_set",
+ "get_dataset_info",
+ "get_quantity_info",
+ "list_dataset_ids",
+ "list_dataset_infos",
+ "list_quantities",
+ "LinearFit",
+ "LinearTransfer",
+ "SubstitutionTransfer",
+ "LookupResult",
+ "ValuePolicy",
+ "lookup_value",
+ "get_value",
+ "RadiiPolicy",
+ "RadiiElementAssessment",
+ "RadiiPolicyAssessment",
+ "DEFAULT_COVALENT_POLICY",
+ "DEFAULT_VDW_POLICY",
+ "list_radii_sets",
+ "list_radii_set_infos",
+ "get_radii_set",
+ "get_radii_set_info",
+ "lookup_covalent_radius",
+ "get_covalent_radius",
+ "lookup_vdw_radius",
+ "get_vdw_radius",
+ "assess_radii_policy",
+ "XHPolicy",
+ "DEFAULT_XH_POLICY",
+ "list_xh_sets",
+ "list_xh_set_infos",
+ "get_xh_set",
+ "get_xh_set_info",
+ "lookup_xh_bond_length",
+ "get_xh_bond_length",
+]
diff --git a/src/atomref/data/__init__.py b/src/atomref/data/__init__.py
new file mode 100644
index 0000000..835d4e0
--- /dev/null
+++ b/src/atomref/data/__init__.py
@@ -0,0 +1 @@
+"""Packaged data files for atomref."""
diff --git a/src/atomref/data/covalent.csv b/src/atomref/data/covalent.csv
new file mode 100644
index 0000000..053a71a
--- /dev/null
+++ b/src/atomref/data/covalent.csv
@@ -0,0 +1,119 @@
+z,cordero2008,csd_legacy_cov
+1,0.31,0.23
+2,0.28,1.5
+3,1.28,1.28
+4,0.96,0.96
+5,0.84,0.83
+6,0.76,0.68
+7,0.71,0.68
+8,0.66,0.68
+9,0.57,0.64
+10,0.58,1.5
+11,1.66,1.66
+12,1.41,1.41
+13,1.21,1.21
+14,1.11,1.2
+15,1.07,1.05
+16,1.05,1.02
+17,1.02,0.99
+18,1.06,1.51
+19,2.03,2.03
+20,1.76,1.76
+21,1.7,1.7
+22,1.6,1.6
+23,1.53,1.53
+24,1.39,1.39
+25,1.61,1.61
+26,1.52,1.52
+27,1.5,1.26
+28,1.24,1.24
+29,1.32,1.32
+30,1.22,1.22
+31,1.22,1.22
+32,1.2,1.17
+33,1.19,1.21
+34,1.2,1.22
+35,1.2,1.21
+36,1.16,1.5
+37,2.2,2.2
+38,1.95,1.95
+39,1.9,1.9
+40,1.75,1.75
+41,1.64,1.64
+42,1.54,1.54
+43,1.47,1.47
+44,1.46,1.46
+45,1.42,1.42
+46,1.39,1.39
+47,1.45,1.45
+48,1.44,1.54
+49,1.42,1.42
+50,1.39,1.39
+51,1.39,1.39
+52,1.38,1.47
+53,1.39,1.4
+54,1.4,1.5
+55,2.44,2.44
+56,2.15,2.15
+57,2.07,2.07
+58,2.04,2.04
+59,2.03,2.03
+60,2.01,2.01
+61,1.99,1.99
+62,1.98,1.98
+63,1.98,1.98
+64,1.96,1.96
+65,1.94,1.94
+66,1.92,1.92
+67,1.92,1.92
+68,1.89,1.89
+69,1.9,1.9
+70,1.87,1.87
+71,1.87,1.87
+72,1.75,1.75
+73,1.7,1.7
+74,1.62,1.62
+75,1.51,1.51
+76,1.44,1.44
+77,1.41,1.41
+78,1.36,1.36
+79,1.36,1.36
+80,1.32,1.32
+81,1.45,1.45
+82,1.46,1.46
+83,1.48,1.48
+84,1.4,1.4
+85,1.5,1.21
+86,1.5,1.5
+87,2.6,2.6
+88,2.21,2.21
+89,2.15,2.15
+90,2.06,2.06
+91,2,2
+92,1.96,1.96
+93,1.9,1.9
+94,1.87,1.87
+95,1.8,1.8
+96,1.69,1.69
+97,,1.54
+98,,1.83
+99,,1.5
+100,,1.5
+101,,1.5
+102,,1.5
+103,,1.5
+104,,1.5
+105,,1.5
+106,,1.5
+107,,1.5
+108,,1.5
+109,,1.5
+110,,1.5
+111,,
+112,,
+113,,
+114,,
+115,,
+116,,
+117,,
+118,,
diff --git a/src/atomref/data/periodic_table.csv b/src/atomref/data/periodic_table.csv
new file mode 100644
index 0000000..744b4aa
--- /dev/null
+++ b/src/atomref/data/periodic_table.csv
@@ -0,0 +1,119 @@
+z,symbol,name
+1,H,Hydrogen
+2,He,Helium
+3,Li,Lithium
+4,Be,Beryllium
+5,B,Boron
+6,C,Carbon
+7,N,Nitrogen
+8,O,Oxygen
+9,F,Fluorine
+10,Ne,Neon
+11,Na,Sodium
+12,Mg,Magnesium
+13,Al,Aluminium
+14,Si,Silicon
+15,P,Phosphorus
+16,S,Sulphur
+17,Cl,Chlorine
+18,Ar,Argon
+19,K,Potassium
+20,Ca,Calcium
+21,Sc,Scandium
+22,Ti,Titanium
+23,V,Vanadium
+24,Cr,Chromium
+25,Mn,Manganese
+26,Fe,Iron
+27,Co,Cobalt
+28,Ni,Nickel
+29,Cu,Copper
+30,Zn,Zinc
+31,Ga,Gallium
+32,Ge,Germanium
+33,As,Arsenic
+34,Se,Selenium
+35,Br,Bromine
+36,Kr,Krypton
+37,Rb,Rubidium
+38,Sr,Strontium
+39,Y,Yttrium
+40,Zr,Zirconium
+41,Nb,Niobium
+42,Mo,Molybdenum
+43,Tc,Technetium
+44,Ru,Ruthenium
+45,Rh,Rhodium
+46,Pd,Palladium
+47,Ag,Silver
+48,Cd,Cadmium
+49,In,Indium
+50,Sn,Tin
+51,Sb,Antimony
+52,Te,Tellurium
+53,I,Iodine
+54,Xe,Xenon
+55,Cs,Caesium
+56,Ba,Barium
+57,La,Lanthanum
+58,Ce,Cerium
+59,Pr,Praseodymium
+60,Nd,Neodymium
+61,Pm,Promethium
+62,Sm,Samarium
+63,Eu,Europium
+64,Gd,Gadolinium
+65,Tb,Terbium
+66,Dy,Dysprosium
+67,Ho,Holmium
+68,Er,Erbium
+69,Tm,Thulium
+70,Yb,Ytterbium
+71,Lu,Lutetium
+72,Hf,Hafnium
+73,Ta,Tantalum
+74,W,Tungsten
+75,Re,Rhenium
+76,Os,Osmium
+77,Ir,Iridium
+78,Pt,Platinum
+79,Au,Gold
+80,Hg,Mercury
+81,Tl,Thallium
+82,Pb,Lead
+83,Bi,Bismuth
+84,Po,Polonium
+85,At,Astatine
+86,Rn,Radon
+87,Fr,Francium
+88,Ra,Radium
+89,Ac,Actinium
+90,Th,Thorium
+91,Pa,Protactinium
+92,U,Uranium
+93,Np,Neptunium
+94,Pu,Plutonium
+95,Am,Americium
+96,Cm,Curium
+97,Bk,Berkelium
+98,Cf,Californium
+99,Es,Einsteinium
+100,Fm,Fermium
+101,Md,Mendelevium
+102,No,Nobelium
+103,Lr,Lawrencium
+104,Rf,Rutherfordium
+105,Db,Dubnium
+106,Sg,Seaborgium
+107,Bh,Bohrium
+108,Hs,Hassium
+109,Mt,Meitnerium
+110,Ds,Darmstadtium
+111,Rg,Roentgenium
+112,Cn,Copernicium
+113,Nh,Nihonium
+114,Fl,Flerovium
+115,Mc,Moscovium
+116,Lv,Livermorium
+117,Ts,Tennessine
+118,Og,Oganesson
diff --git a/src/atomref/data/registry.json b/src/atomref/data/registry.json
new file mode 100644
index 0000000..e6e4469
--- /dev/null
+++ b/src/atomref/data/registry.json
@@ -0,0 +1,506 @@
+{
+ "schema_version": "0.1",
+ "created_from": {
+ "source_project": "molcryst",
+ "source_schema_version": "0.2",
+ "notes": [
+ "Transformed for the initial atomref v0.1 scaffold.",
+ "Rahm 2016 is reclassified from van_der_waals to atomic_radius."
+ ]
+ },
+ "quantities": {
+ "covalent_radius": {
+ "domain": "element",
+ "units": "angstrom",
+ "description": "Element-indexed covalent radii intended for geometry and bonding heuristics."
+ },
+ "van_der_waals_radius": {
+ "domain": "element",
+ "units": "angstrom",
+ "description": "Element-indexed condensed-phase or contact-derived van der Waals radii."
+ },
+ "atomic_radius": {
+ "domain": "element",
+ "units": "angstrom",
+ "description": "Element-indexed isolated-atom or theory-defined atomic radii used as transferable support data."
+ },
+ "xh_bond_length": {
+ "domain": "element",
+ "units": "angstrom",
+ "description": "Element-indexed reference X-H bond lengths keyed by parent element X and intended for hydrogen-position normalisation or related geometry workflows."
+ }
+ },
+ "datasets": {
+ "covalent_radius": {
+ "cordero2008": {
+ "name": "Cordero et al. covalent radii",
+ "description": "Covalent radii from Cordero et al. (2008) (last author: Alvarez).",
+ "semantic_class": "covalent_structural",
+ "origin_class": "compiled_experimental",
+ "phase_context": "condensed_phase",
+ "method_summary": "Derived from crystallographic bond distances (primarily single bonds) across the periodic table.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "covalent.csv",
+ "column": "cordero2008"
+ },
+ "coverage": {
+ "n_values": 96,
+ "z_min": 1,
+ "z_max": 96,
+ "has_placeholders": false
+ },
+ "placeholder_value": null,
+ "extraction_source": "Table 2 in B. Cordero et al. (2008), column 'r'",
+ "aliases": [
+ "Cordero covalent radii",
+ "Cordero–Alvarez covalent radii",
+ "Alvarez covalent radii (2008)"
+ ],
+ "references": [
+ {
+ "authors": "B. Cordero et al.",
+ "doi": "10.1039/B801115J",
+ "title": "Covalent radii revisited",
+ "venue": "Dalton Trans. (2008) 2832-2838"
+ }
+ ],
+ "notes": [
+ "The source paper provides multiple radii per element for different atom types/environments; this package currently includes the C(sp3) value for C and high-spin values for Mn/Fe/Co."
+ ],
+ "usage_role": "target"
+ },
+ "csd_legacy_cov": {
+ "name": "CSD legacy covalent radii (bond perception)",
+ "description": "Legacy covalent radii used in CSD software for bond assignment (Rcov).",
+ "semantic_class": "covalent_legacy",
+ "origin_class": "curated_heuristic",
+ "phase_context": "mixed_or_legacy",
+ "method_summary": null,
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "covalent.csv",
+ "column": "csd_legacy_cov"
+ },
+ "coverage": {
+ "n_values": 110,
+ "z_min": 1,
+ "z_max": 110,
+ "has_placeholders": true
+ },
+ "placeholder_value": 1.5,
+ "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'Covalent Radius'.",
+ "aliases": [],
+ "references": [
+ {
+ "publisher": "Cambridge Crystallographic Data Centre (CCDC)",
+ "title": "Elemental Data and Radii (Excel)",
+ "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx"
+ },
+ {
+ "authors": "B. Cordero et al.",
+ "doi": "10.1039/B801115J",
+ "title": "Covalent radii revisited",
+ "venue": "Dalton Trans. (2008) 2832-2838"
+ }
+ ],
+ "notes": [
+ "CSD bond assignment heuristic: a bond A-B may be inferred if distance d satisfies Rcov(A)+Rcov(B)-t <= d <= Rcov(A)+Rcov(B)+t, with typical t=0.4 Å. (See the CCDC spreadsheet notes.)",
+ "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).",
+ "Elements not yet encountered in the CSD have Rcov = 1.50 Å."
+ ],
+ "usage_role": "support"
+ }
+ },
+ "van_der_waals_radius": {
+ "bondi1964": {
+ "name": "Bondi van der Waals radii",
+ "description": "Classic van der Waals radii compiled by Bondi (1964), available for 38 elements.",
+ "semantic_class": "vdw_compiled",
+ "origin_class": "compiled_experimental",
+ "phase_context": "mixed_or_legacy",
+ "method_summary": "Bondi compiled van der Waals radii from a combination of experimental sources (e.g., crystal structures, liquid-state properties, gas kinetic data) to reproduce molecular/atomic volumes and sizes. This set is widely used as a historical reference and in many computational chemistry defaults.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "bondi1964"
+ },
+ "coverage": {
+ "n_values": 38,
+ "z_min": 1,
+ "z_max": 92,
+ "has_placeholders": false,
+ "covered_z": [
+ 1,
+ 2,
+ 3,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 14,
+ 15,
+ 16,
+ 17,
+ 18,
+ 19,
+ 28,
+ 29,
+ 30,
+ 31,
+ 33,
+ 34,
+ 35,
+ 36,
+ 46,
+ 47,
+ 48,
+ 49,
+ 50,
+ 52,
+ 53,
+ 54,
+ 78,
+ 79,
+ 80,
+ 81,
+ 82,
+ 92
+ ]
+ },
+ "placeholder_value": null,
+ "extraction_source": "Bondi column in Table 1 of Alvarez (2013) (used as a convenient transcription of Bondi's tabulation).",
+ "aliases": [
+ "Bondi radii",
+ "Bondi vdW radii"
+ ],
+ "references": [
+ {
+ "authors": "A. Bondi",
+ "title": "van der Waals Volumes and Radii",
+ "venue": "J. Phys. Chem. 68 (1964) 441-451",
+ "doi": "10.1021/j100785a001"
+ },
+ {
+ "authors": "S. Alvarez",
+ "title": "A cartography of the van der Waals territories",
+ "venue": "Dalton Trans. 42 (2013) 8617-8636",
+ "doi": "10.1039/C3DT50599E",
+ "note": "Table 1 reproduces Bondi radii for 38 elements."
+ }
+ ],
+ "notes": [
+ "Coverage is limited (38 elements, including only a few transition metals and uranium).",
+ "Because Bondi radii were not derived exclusively from crystal nonbonded contact statistics, they can differ slightly from later 'structural' vdW radii."
+ ],
+ "usage_role": "target"
+ },
+ "rowland_taylor1996": {
+ "name": "Rowland & Taylor nonbonded contact radii",
+ "description": "Nonbonded contact radii derived from organic crystal structures (Rowland & Taylor, 1996).",
+ "semantic_class": "vdw_structural",
+ "origin_class": "structural",
+ "phase_context": "condensed_phase",
+ "method_summary": "Rowland & Taylor analyzed distributions of intermolecular nonbonded contact distances in organic crystal structures from the Cambridge Structural Database (CSD). They fitted/estimated characteristic contact distances and solved for per-element radii by least-squares analysis over many element-pair distance distributions.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "rowland_taylor1996"
+ },
+ "coverage": {
+ "n_values": 9,
+ "z_min": 1,
+ "z_max": 53,
+ "has_placeholders": false,
+ "covered_z": [
+ 1,
+ 6,
+ 7,
+ 8,
+ 9,
+ 16,
+ 17,
+ 35,
+ 53
+ ]
+ },
+ "placeholder_value": null,
+ "extraction_source": "Table 3 in Rowland & Taylor (1996), column 'r_c' (least-squares radii, not the normalized R_d column).",
+ "aliases": [
+ "Rowland–Taylor radii",
+ "Rowland & Taylor vdW radii"
+ ],
+ "references": [
+ {
+ "authors": "R. S. Rowland; R. Taylor",
+ "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii",
+ "venue": "J. Phys. Chem. 100 (1996) 7384-7391",
+ "doi": "10.1021/jp953141+"
+ }
+ ],
+ "notes": [
+ "Coverage is intentionally limited to common organic-crystal nonmetals (H, C, N, O, F, S, Cl, Br, I).",
+ "Rowland & Taylor also report a normalized set (R_d) constrained to match the total of Bondi radii; this package uses the raw least-squares r_c values."
+ ],
+ "usage_role": "target"
+ },
+ "alvarez2013": {
+ "name": "Alvarez van der Waals radii",
+ "description": "van der Waals radii from Alvarez (2013).",
+ "semantic_class": "vdw_structural",
+ "origin_class": "structural",
+ "phase_context": "condensed_phase",
+ "method_summary": null,
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "alvarez2013"
+ },
+ "coverage": {
+ "n_values": 93,
+ "z_min": 1,
+ "z_max": 99,
+ "has_placeholders": false,
+ "missing_z": [
+ 61,
+ 84,
+ 85,
+ 86,
+ 87,
+ 88
+ ]
+ },
+ "placeholder_value": null,
+ "extraction_source": "Table 1 in Alvarez (2013), column 'r_vdW'.",
+ "aliases": [
+ "Alvarez vdW radii",
+ "Alvarez (2013) r_vdW",
+ "Dalton Trans. vdW cartography radii"
+ ],
+ "references": [
+ {
+ "authors": "S. Alvarez",
+ "doi": "10.1039/C3DT50599E",
+ "title": "A cartography of the van der Waals territories",
+ "venue": "Dalton Trans. 42 (2013) 8617-8636"
+ }
+ ],
+ "notes": [
+ "Obtained by statistical analysis of millions of interatomic distances in the Cambridge Structural Database (CSD), locating the vdW peak after the vdW gap."
+ ],
+ "usage_role": "target"
+ },
+ "chernyshov2020": {
+ "name": "Chernyshov LoS van der Waals radii",
+ "description": "van der Waals radii from Chernyshov et al. (ChemPhysChem 2020) using line-of-sight (LoS) classification of direct contacts.",
+ "semantic_class": "vdw_structural_typed_reduced",
+ "origin_class": "structural",
+ "phase_context": "condensed_phase",
+ "method_summary": "Chernyshov et al. introduce a line-of-sight (LoS) criterion to identify 'direct' interatomic contacts in complex molecular crystals. vdW radii are then inferred from statistically analyzed contact-distance distributions for specific atom types, yielding radii (including R_half and R_max variants) intended to better reflect steric/anisotropic effects than simple distance-based heuristics.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "chernyshov2020"
+ },
+ "coverage": {
+ "n_values": 10,
+ "z_min": 1,
+ "z_max": 53,
+ "has_placeholders": false,
+ "covered_z": [
+ 1,
+ 6,
+ 7,
+ 8,
+ 9,
+ 16,
+ 17,
+ 34,
+ 35,
+ 53
+ ]
+ },
+ "placeholder_value": null,
+ "extraction_source": "Table 1 in Chernyshov et al. (2020): R_max values for the 'default' atom types typical for organic compounds.",
+ "aliases": [
+ "LoS vdW radii",
+ "Chernyshov vdW radii"
+ ],
+ "references": [
+ {
+ "authors": "I. Yu. Chernyshov; I. V. Ananyev; E. A. Pidko",
+ "title": "Revisiting van der Waals Radii: From Comprehensive Structural Analysis to Knowledge-Based Classification of Interatomic Contacts",
+ "venue": "ChemPhysChem 21 (2020) 1–8",
+ "doi": "10.1002/cphc.201901083"
+ }
+ ],
+ "notes": [
+ "The source paper provides multiple radii per element for different atom types/environments; this package currently includes only the main/default R_max values used in Table 1.",
+ "Primarily targeted at elements common in organic crystals (H, C, N, O, F, S, Cl, Se, Br, I)."
+ ],
+ "usage_role": "target"
+ },
+ "csd_legacy_vdw": {
+ "name": "CSD legacy van der Waals radii (pre-2024.3)",
+ "description": "Legacy van der Waals radii historically used in CSD tools (pre-2024.3).",
+ "semantic_class": "vdw_legacy",
+ "origin_class": "curated_heuristic",
+ "phase_context": "mixed_or_legacy",
+ "method_summary": null,
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "csd_legacy_vdw"
+ },
+ "coverage": {
+ "n_values": 110,
+ "z_min": 1,
+ "z_max": 110,
+ "has_placeholders": true
+ },
+ "placeholder_value": 2.0,
+ "extraction_source": "CCDC Elemental_Radii.xlsx (CSD radii table), column 'vdW Radius' (Bondi/Rowland-Taylor based with defaults).",
+ "aliases": [],
+ "references": [
+ {
+ "authors": "A. Bondi",
+ "doi": "10.1021/j100785a001",
+ "title": "van der Waals Volumes and Radii",
+ "venue": "J. Phys. Chem. 68 (1964) 441-451"
+ },
+ {
+ "authors": "R. S. Rowland; R. Taylor",
+ "doi": "10.1021/jp953141+",
+ "title": "Intermolecular Nonbonded Contact Distances in Organic Crystal Structures: Comparison with Distances Expected from van der Waals Radii",
+ "venue": "J. Phys. Chem. 100 (1996) 7384-7391"
+ },
+ {
+ "publisher": "CCDC",
+ "title": "Elemental Data and Radii (Excel)",
+ "url": "https://www.ccdc.cam.ac.uk/media/Documentation/F8D8439E-30C5-4FA8-B781-D9E65AAB0BF3/Elemental_Radii.xlsx"
+ },
+ {
+ "publisher": "CCDC blog",
+ "title": "Updates to van der Waals radii used in the CSD and Mercury",
+ "url": "https://www.ccdc.cam.ac.uk/discover/blog/updates-to-van-der-waals-radii-csd-mercury/"
+ }
+ ],
+ "notes": [
+ "For Z>=111, csd_legacy values are omitted because the legacy CSD table does not provide radii beyond Darmstadtium (Z=110).",
+ "Radii that are not available in either Bondi or Rowland & Taylor versions were assigned RvdW of 2.00 Å.",
+ "The CSD 2024.3 release updated the vdW radii used in CSD and Mercury to Alvarez-derived values (see CCDC blog post)."
+ ],
+ "usage_role": "support"
+ }
+ },
+ "atomic_radius": {
+ "rahm2016": {
+ "name": "Rahm isodensity atomic radii (ρ=0.001 e/bohr³)",
+ "description": "Computed atomic radii for neutral atoms (elements 1–96) defined by the ρ=0.001 e/bohr³ electron-density isosurface (Rahm et al., 2016).",
+ "semantic_class": "atomic_isodensity",
+ "origin_class": "computational",
+ "phase_context": "isolated_atom",
+ "method_summary": "Rahm et al. computed relativistic all-electron DFT electron densities (close to the basis-set limit) for isolated atoms and ions. Radii are defined by an electron-density threshold, producing a consistent, theory-based size measure that correlates well with structural van der Waals radii derived from crystal structures.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "van_der_waals.csv",
+ "column": "rahm2016"
+ },
+ "coverage": {
+ "n_values": 96,
+ "z_min": 1,
+ "z_max": 96,
+ "has_placeholders": false
+ },
+ "placeholder_value": null,
+ "extraction_source": "Supporting Information for Rahm et al. (2016), Table S1: neutral-atom radii for elements 1–96.",
+ "aliases": [
+ "Rahm radii",
+ "Rahm–Hoffmann–Ashcroft atomic radii",
+ "0.001 e/bohr^3 radii"
+ ],
+ "references": [
+ {
+ "authors": "M. Rahm; R. Hoffmann; N. W. Ashcroft",
+ "title": "Atomic and Ionic Radii of Elements 1–96",
+ "venue": "Chem. Eur. J. 22 (2016) 14625–14632",
+ "doi": "10.1002/chem.201602949"
+ },
+ {
+ "title": "Chem. Eur. J. 2016, 22, 14625–14632 (Rahm et al.) – Misc. Information",
+ "url": "http://dx.doi.org/10.1002/chem.201602949",
+ "publisher": "Supporting Information",
+ "note": "Table S1 contains the neutral-atom radii used here."
+ }
+ ],
+ "notes": [
+ "The original work also reports cationic radii (+1) for the first 96 elements and selected anionic radii (−1) for some elements; these are not yet included in the current CSV.",
+ "In atomref this dataset is classified as atomic support data, not as a direct condensed-phase van der Waals-radius set, because it describes isolated atoms in vacuum and is used here primarily as a transferable baseline."
+ ],
+ "usage_role": "support"
+ }
+ },
+ "xh_bond_length": {
+ "csd_legacy_xh_cno": {
+ "name": "CSD legacy X-H neutron-normalisation targets (C/N/O)",
+ "description": "Fixed C-H, N-H, and O-H target bond lengths used by ConQuest for hydrogen-position normalisation.",
+ "semantic_class": "xh_neutron_normalisation",
+ "origin_class": "compiled_experimental",
+ "phase_context": "condensed_phase",
+ "method_summary": "Sparse parent-element target set for hydrogen normalisation. ConQuest moves H along the experimentally determined X-H vector to these neutron-derived distances.",
+ "storage": {
+ "format": "dense_by_z_csv",
+ "filename": "xh_bond_length.csv",
+ "column": "csd_legacy_xh_cno"
+ },
+ "coverage": {
+ "n_values": 3,
+ "z_min": 6,
+ "z_max": 8,
+ "has_placeholders": false,
+ "covered_z": [
+ 6,
+ 7,
+ 8
+ ],
+ "missing_z": [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5
+ ]
+ },
+ "placeholder_value": null,
+ "extraction_source": "ConQuest User Guide and Tutorials, section 'Hydrogen Atom Location in Crystal Structure Analyses'.",
+ "aliases": [
+ "CSD X-H normalisation defaults",
+ "ConQuest X-H normalisation",
+ "CSD legacy X-H"
+ ],
+ "references": [
+ {
+ "publisher": "Cambridge Crystallographic Data Centre (CCDC)",
+ "title": "ConQuest User Guide and Tutorials",
+ "url": "https://www.ccdc.cam.ac.uk/media/Documentation/C82017ED-FAE4-4D93-BA5A-8D841F1E4314/ConQuest-UserGuide_2020_1.pdf",
+ "note": "Hydrogen Atom Location in Crystal Structure Analyses; ConQuest normalises terminal C-H, N-H, and O-H distances to 1.089 Å, 1.015 Å, and 0.993 Å, respectively."
+ },
+ {
+ "authors": "F. H. Allen; I. J. Bruno",
+ "title": "Bond lengths in organic and metal-organic compounds revisited: X-H bond lengths from neutron diffraction data",
+ "venue": "Acta Cryst. B66 (2010) 380-386"
+ }
+ ],
+ "notes": [
+ "Sparse provisional target set for parent elements C, N, and O only.",
+ "In atomref v0.1.x this dataset seeds transfer-based inference for other parent elements rather than claiming direct curated coverage beyond C/N/O.",
+ "Fuller X-H dataset and policy support is planned for atomref 0.2.x."
+ ],
+ "usage_role": "target"
+ }
+ }
+ }
+}
diff --git a/src/atomref/data/van_der_waals.csv b/src/atomref/data/van_der_waals.csv
new file mode 100644
index 0000000..86e7be3
--- /dev/null
+++ b/src/atomref/data/van_der_waals.csv
@@ -0,0 +1,119 @@
+z,bondi1964,rowland_taylor1996,alvarez2013,chernyshov2020,csd_legacy_vdw,rahm2016
+1,1.2,1.1,1.2,1.21,1.09,1.54
+2,1.4,,1.43,,1.4,1.34
+3,1.81,,2.12,,1.82,2.2
+4,,,1.98,,2,2.19
+5,,,1.91,,2,2.05
+6,1.7,1.77,1.77,1.91,1.7,1.9
+7,1.55,1.64,1.66,1.76,1.55,1.79
+8,1.52,1.58,1.5,1.74,1.52,1.71
+9,1.47,1.46,1.46,1.55,1.47,1.63
+10,1.54,,1.58,,1.54,1.56
+11,2.27,,2.5,,2.27,2.25
+12,1.73,,2.51,,1.73,2.4
+13,,,2.25,,2,2.39
+14,2.22,,2.19,,2.1,2.32
+15,1.8,,1.9,,1.8,2.23
+16,1.8,1.81,1.89,1.95,1.8,2.14
+17,1.75,1.76,1.82,1.91,1.75,2.06
+18,1.76,,1.83,,1.88,1.97
+19,2.75,,2.73,,2.75,2.34
+20,,,2.62,,2,2.7
+21,,,2.58,,2,2.63
+22,,,2.46,,2,2.57
+23,,,2.42,,2,2.52
+24,,,2.45,,2,2.33
+25,,,2.45,,2,2.42
+26,,,2.44,,2,2.26
+27,,,2.4,,2,2.22
+28,1.63,,2.4,,1.63,2.19
+29,1.4,,2.38,,1.4,2.17
+30,1.39,,2.39,,1.39,2.22
+31,1.87,,2.32,,1.87,2.33
+32,,,2.29,,2,2.34
+33,1.85,,1.88,,1.85,2.31
+34,1.9,,1.82,2.04,1.9,2.24
+35,1.83,1.87,1.86,2,1.85,2.19
+36,2.02,,2.25,,2.02,2.12
+37,,,3.21,,2,2.4
+38,,,2.84,,2,2.79
+39,,,2.75,,2,2.74
+40,,,2.52,,2,2.68
+41,,,2.56,,2,2.51
+42,,,2.45,,2,2.44
+43,,,2.44,,2,2.41
+44,,,2.46,,2,2.37
+45,,,2.44,,2,2.33
+46,1.63,,2.15,,1.63,2.15
+47,1.72,,2.53,,1.72,2.25
+48,1.62,,2.49,,1.58,2.38
+49,1.93,,2.43,,1.93,2.46
+50,2.17,,2.42,,2.17,2.48
+51,,,2.47,,2,2.46
+52,2,,1.99,,2.06,2.42
+53,1.98,2.03,2.04,2.17,1.98,2.38
+54,2.16,,2.06,,2.16,2.32
+55,,,3.48,,2,2.49
+56,,,3.03,,2,2.93
+57,,,2.98,,2,2.84
+58,,,2.88,,2,2.82
+59,,,2.92,,2,2.86
+60,,,2.95,,2,2.84
+61,,,,,2,2.83
+62,,,2.9,,2,2.8
+63,,,2.87,,2,2.8
+64,,,2.83,,2,2.77
+65,,,2.79,,2,2.76
+66,,,2.87,,2,2.75
+67,,,2.81,,2,2.73
+68,,,2.83,,2,2.72
+69,,,2.79,,2,2.71
+70,,,2.8,,2,2.77
+71,,,2.74,,2,2.7
+72,,,2.63,,2,2.64
+73,,,2.53,,2,2.58
+74,,,2.57,,2,2.53
+75,,,2.49,,2,2.49
+76,,,2.48,,2,2.44
+77,,,2.41,,2,2.33
+78,1.72,,2.29,,1.72,2.3
+79,1.66,,2.32,,1.66,2.26
+80,1.7,,2.45,,1.55,2.29
+81,1.96,,2.47,,1.96,2.42
+82,2.02,,2.6,,2.02,2.49
+83,,,2.54,,2,2.5
+84,,,,,2,2.5
+85,,,,,2,2.47
+86,,,,,2,2.43
+87,,,,,2,2.58
+88,,,,,2,2.92
+89,,,2.8,,2,2.93
+90,,,2.93,,2,2.89
+91,,,2.88,,2,2.85
+92,1.86,,2.71,,1.86,2.83
+93,,,2.82,,2,2.8
+94,,,2.81,,2,2.78
+95,,,2.83,,2,2.76
+96,,,3.05,,2,2.76
+97,,,3.4,,2,
+98,,,3.05,,2,
+99,,,2.7,,2,
+100,,,,,2,
+101,,,,,2,
+102,,,,,2,
+103,,,,,2,
+104,,,,,2,
+105,,,,,2,
+106,,,,,2,
+107,,,,,2,
+108,,,,,2,
+109,,,,,2,
+110,,,,,2,
+111,,,,,,
+112,,,,,,
+113,,,,,,
+114,,,,,,
+115,,,,,,
+116,,,,,,
+117,,,,,,
+118,,,,,,
diff --git a/src/atomref/data/xh_bond_length.csv b/src/atomref/data/xh_bond_length.csv
new file mode 100644
index 0000000..4ae4bca
--- /dev/null
+++ b/src/atomref/data/xh_bond_length.csv
@@ -0,0 +1,119 @@
+z,csd_legacy_xh_cno
+1,
+2,
+3,
+4,
+5,
+6,1.089
+7,1.015
+8,0.993
+9,
+10,
+11,
+12,
+13,
+14,
+15,
+16,
+17,
+18,
+19,
+20,
+21,
+22,
+23,
+24,
+25,
+26,
+27,
+28,
+29,
+30,
+31,
+32,
+33,
+34,
+35,
+36,
+37,
+38,
+39,
+40,
+41,
+42,
+43,
+44,
+45,
+46,
+47,
+48,
+49,
+50,
+51,
+52,
+53,
+54,
+55,
+56,
+57,
+58,
+59,
+60,
+61,
+62,
+63,
+64,
+65,
+66,
+67,
+68,
+69,
+70,
+71,
+72,
+73,
+74,
+75,
+76,
+77,
+78,
+79,
+80,
+81,
+82,
+83,
+84,
+85,
+86,
+87,
+88,
+89,
+90,
+91,
+92,
+93,
+94,
+95,
+96,
+97,
+98,
+99,
+100,
+101,
+102,
+103,
+104,
+105,
+106,
+107,
+108,
+109,
+110,
+111,
+112,
+113,
+114,
+115,
+116,
+117,
+118,
diff --git a/src/atomref/elements.py b/src/atomref/elements.py
new file mode 100644
index 0000000..5245b80
--- /dev/null
+++ b/src/atomref/elements.py
@@ -0,0 +1,110 @@
+"""Periodic-table access for stable element identity."""
+
+from __future__ import annotations
+
+import csv
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from importlib import resources
+
+
+_MISSING_TOKENS = {"", "?", "."}
+_LEADING_ALPHA_RE = re.compile(r"([A-Za-z]{1,3})")
+
+
+@dataclass(frozen=True, slots=True)
+class Element:
+ """Chemical element identity keyed by atomic number and symbol."""
+
+ z: int
+ symbol: str
+ name: str
+
+
+def _normalize_element_token(token: str | None) -> str | None:
+ """Strip quotes and obvious missing-value markers from a token."""
+
+ if token is None:
+ return None
+
+ raw = token.strip()
+ if raw in _MISSING_TOKENS:
+ return None
+
+ if (raw.startswith("'") and raw.endswith("'")) or (
+ raw.startswith('"') and raw.endswith('"')
+ ):
+ raw = raw[1:-1].strip()
+ if raw in _MISSING_TOKENS:
+ return None
+
+ if not raw:
+ return None
+ return raw
+
+
+def canonicalize_element_symbol(token: str | None) -> str | None:
+ """Canonicalize a free-form token to a conventional element symbol.
+
+ The function accepts strings such as ``"cl"``, ``" Cl "`` or
+ ``"Cl12"`` and returns ``"Cl"`` when a leading element-like token can be
+ identified. Missing-value markers and non-element strings return ``None``.
+ """
+
+ raw = _normalize_element_token(token)
+ if raw is None:
+ return None
+
+ match = _LEADING_ALPHA_RE.match(raw)
+ if match is None:
+ return None
+
+ letters = match.group(1)
+ return letters[0].upper() + letters[1:].lower()
+
+
+@lru_cache(maxsize=1)
+def _load_elements_by_symbol() -> dict[str, Element]:
+ """Load the packaged periodic table into a symbol-keyed mapping."""
+
+ table_path = resources.files("atomref.data").joinpath("periodic_table.csv")
+ with table_path.open("r", encoding="utf-8", newline="") as handle:
+ reader = csv.DictReader(handle)
+ out: dict[str, Element] = {}
+ for row in reader:
+ z = int(row["z"])
+ symbol = row["symbol"]
+ name = row["name"]
+ out[symbol] = Element(z=z, symbol=symbol, name=name)
+ return out
+
+
+@lru_cache(maxsize=1)
+def _elements_in_z_order() -> tuple[Element, ...]:
+ """Return packaged elements sorted by increasing atomic number."""
+
+ return tuple(sorted(_load_elements_by_symbol().values(), key=lambda e: e.z))
+
+
+def is_valid_element_symbol(symbol: str | None) -> bool:
+ """Return ``True`` if ``symbol`` is a known packaged element symbol."""
+
+ if symbol is None:
+ return False
+ return symbol in _load_elements_by_symbol()
+
+
+def get_element(symbol: str | None) -> Element | None:
+ """Look up packaged element identity from a symbol-like token."""
+
+ sym = canonicalize_element_symbol(symbol)
+ if sym is None:
+ return None
+ return _load_elements_by_symbol().get(sym)
+
+
+def iter_elements() -> tuple[Element, ...]:
+ """Return all packaged elements in increasing atomic-number order."""
+
+ return _elements_in_z_order()
diff --git a/src/atomref/errors.py b/src/atomref/errors.py
new file mode 100644
index 0000000..d31660a
--- /dev/null
+++ b/src/atomref/errors.py
@@ -0,0 +1,17 @@
+"""Package-local exceptions used across :mod:`atomref`."""
+
+
+class AtomrefError(Exception):
+ """Base class for package-defined errors."""
+
+
+class DatasetError(AtomrefError):
+ """Raised when packaged data or registry metadata are invalid."""
+
+
+class MissingValueError(AtomrefError):
+ """Raised when a required reference value is unavailable."""
+
+
+class PolicyError(AtomrefError):
+ """Raised for invalid policy configuration or transfer resolution."""
diff --git a/src/atomref/policy.py b/src/atomref/policy.py
new file mode 100644
index 0000000..79cc9f3
--- /dev/null
+++ b/src/atomref/policy.py
@@ -0,0 +1,794 @@
+"""Generic value-policy resolution for element-indexed scalar datasets."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+import contextvars
+from dataclasses import dataclass, field
+from functools import lru_cache
+import math
+from types import MappingProxyType
+from typing import Generic, Literal, TypeVar
+
+from .elements import (
+ canonicalize_element_symbol,
+ is_valid_element_symbol,
+ iter_elements,
+)
+from .errors import PolicyError
+from .registry import (
+ DatasetLike,
+ DatasetRef,
+ ElementScalarSet,
+ _is_placeholder_value,
+ get_builtin_set,
+ resolve_dataset_like,
+)
+from .transfer import (
+ LinearFit,
+ LinearTransfer,
+ SubstitutionTransfer,
+ SupportsValuePolicy,
+ TransferModel,
+)
+
+K = TypeVar("K")
+
+LookupSource = Literal[
+ "override",
+ "base",
+ "transfer_substitution",
+ "transfer_linear",
+ "fallback",
+ "missing",
+]
+
+PolicyToken = tuple[str, int]
+_ACTIVE_POLICY_TOKENS: contextvars.ContextVar[tuple[PolicyToken, ...]] = (
+ contextvars.ContextVar("atomref_active_policy_tokens", default=())
+)
+
+
+@dataclass(frozen=True, slots=True)
+class LookupResult:
+ """Result of resolving one value through a policy.
+
+ ``value`` carries the final scalar value when one could be produced, while
+ ``source`` and the remaining metadata explain how that value was obtained.
+ ``transfer_depth`` counts how many transfer steps were involved in producing
+ the returned value. Direct base and override values therefore have depth 0.
+ """
+
+ value: float | None
+ source: LookupSource
+ target: DatasetRef
+ resolved_from: tuple[DatasetRef, ...] = ()
+ is_placeholder: bool = False
+ fit: LinearFit | None = None
+ notes: tuple[str, ...] = ()
+ transfer_depth: int = 0
+
+ def __float__(self) -> float:
+ """Coerce the resolved value to ``float`` or raise if it is missing."""
+
+ if self.value is None:
+ raise TypeError("reference value is missing")
+ return float(self.value)
+
+
+@dataclass(frozen=True, slots=True)
+class ValuePolicy(Generic[K]):
+ """Ordered rule set for resolving element-domain scalar values.
+
+ The current runtime resolves only element-domain policies even though the
+ metadata layer already records a more general ``domain`` concept. During
+ construction, element-domain override keys are normalized to canonical
+ element symbols and validated as finite floats.
+ """
+
+ base: DatasetLike
+ transfers: tuple[TransferModel, ...] = ()
+ overrides: Mapping[K, float] = field(default_factory=dict)
+ fallback: float | None = None
+ blocked: tuple[str, ...] = ()
+
+ def __post_init__(self) -> None:
+ """Validate and normalize policy configuration eagerly."""
+
+ if self.fallback is not None:
+ object.__setattr__(
+ self,
+ "fallback",
+ _coerce_policy_float(self.fallback, what="policy fallback"),
+ )
+
+ base_set = resolve_dataset_like(self.base)
+ if base_set.info.domain != "element":
+ return
+
+ normalized_blocked: list[str] = []
+ seen_blocked: set[str] = set()
+ for key in self.blocked:
+ if not isinstance(key, str):
+ raise PolicyError(
+ "element-domain blocked keys must be element-symbol strings"
+ )
+ sym = _normalize_element_symbol(key)
+ if sym is None:
+ raise PolicyError(f"invalid blocked element symbol: {key!r}")
+ if sym not in seen_blocked:
+ normalized_blocked.append(sym)
+ seen_blocked.add(sym)
+ object.__setattr__(self, "blocked", tuple(normalized_blocked))
+
+ normalized_overrides: dict[str, float] = {}
+ seen_original_keys: dict[str, str] = {}
+ for key, value in self.overrides.items():
+ if not isinstance(key, str):
+ raise PolicyError(
+ "element-domain policy overrides must be keyed by element "
+ "symbols"
+ )
+ sym = _normalize_element_symbol(key)
+ if sym is None:
+ raise PolicyError(f"invalid override element symbol: {key!r}")
+ if sym in seen_blocked:
+ raise PolicyError(f"override key {key!r} is blocked in this policy")
+ previous = seen_original_keys.get(sym)
+ if previous is not None and previous != key:
+ raise PolicyError(
+ f"override keys {previous!r} and {key!r} both normalize to "
+ f"{sym!r}"
+ )
+ seen_original_keys[sym] = key
+ normalized_overrides[sym] = _coerce_policy_float(
+ value,
+ what=f"override value for {key!r}",
+ )
+
+ object.__setattr__(
+ self,
+ "overrides",
+ MappingProxyType(normalized_overrides),
+ )
+
+
+@dataclass(frozen=True, slots=True)
+class _ResolvedElementSource:
+ """Internal representation of an element-domain transfer source."""
+
+ ref: DatasetRef
+ values_by_z: tuple[float | None, ...]
+ placeholder_by_z: tuple[bool, ...]
+ lookup_source_by_z: tuple[LookupSource | None, ...]
+ transfer_depth_by_z: tuple[int | None, ...]
+ via_policy: bool = False
+
+
+@dataclass(frozen=True, slots=True)
+class _TransferSourceValue:
+ """Internal representation of one value obtained from a transfer source."""
+
+ value: float
+ ref: DatasetRef
+ resolved_from: tuple[DatasetRef, ...]
+ is_placeholder: bool
+ via_policy: bool = False
+ lookup_source: LookupSource | None = None
+ notes: tuple[str, ...] = ()
+ transfer_depth: int = 0
+
+
+def _coerce_policy_float(value: object, *, what: str) -> float:
+ """Return a finite float for policy configuration values."""
+
+ try:
+ out = float(value)
+ except (TypeError, ValueError) as exc:
+ raise PolicyError(f"{what} must be a finite float") from exc
+ if not math.isfinite(out):
+ raise PolicyError(f"{what} must be a finite float")
+ return out
+
+
+def _normalize_element_symbol(symbol: str | None) -> str | None:
+ """Normalize user input to a packaged element symbol.
+
+ The current resolver treats ``D`` and ``T`` as hydrogen aliases.
+ """
+
+ cand = canonicalize_element_symbol(symbol)
+ if cand in {"D", "T"}:
+ cand = "H"
+ if cand is None:
+ return None
+ if not is_valid_element_symbol(cand):
+ return None
+ return cand
+
+
+def _resolve_target_ref(policy: ValuePolicy[object]) -> DatasetRef:
+ """Return the target dataset reference implied by a policy base."""
+
+ return resolve_dataset_like(policy.base).ref
+
+
+def _policy_resolution_tokens(
+ policy: ValuePolicy[object],
+ *,
+ owner: object | None = None,
+) -> tuple[PolicyToken, ...]:
+ """Return all tokens that should be considered active for one resolution.
+
+ We always track the concrete :class:`ValuePolicy` object identity. When a
+ wrapper object such as :class:`atomref.radii.RadiiPolicy` or
+ :class:`atomref.xh.XHPolicy` is the logical source, we also track the
+ wrapper identity so recursion through freshly materialized generic policies
+ is still detected.
+ """
+
+ tokens: list[PolicyToken] = [("policy", id(policy))]
+ if owner is not None:
+ tokens.append((f"owner:{type(owner).__qualname__}", id(owner)))
+ return tuple(tokens)
+
+
+def _lookup_value_with_owner(
+ symbol: str | None,
+ *,
+ policy: ValuePolicy[str],
+ owner: object | None,
+) -> LookupResult:
+ """Internal lookup helper that carries wrapper identity for cycle checks."""
+
+ return _resolve_value(symbol, policy=policy, resolution_owner=owner)
+
+
+def _coerce_nested_policy(
+ source: object,
+) -> tuple[ValuePolicy[str] | None, object | None]:
+ """Return ``source`` as a generic value policy and its logical owner."""
+
+ if isinstance(source, ValuePolicy):
+ return source, None
+ if isinstance(source, SupportsValuePolicy):
+ nested = source.as_value_policy()
+ if not isinstance(nested, ValuePolicy):
+ raise PolicyError("policy-like transfer sources must return ValuePolicy")
+ return nested, source
+ return None, None
+
+
+def _materialize_transfer_source(
+    source: DatasetLike | SupportsValuePolicy | ValuePolicy[str],
+) -> _ResolvedElementSource:
+    """Materialize any element-domain transfer source into dense by-Z arrays.
+
+    Plain dataset sources are passed through with provenance marked as direct
+    ``"base"`` lookups at depth 0.  Policy-like sources are resolved element
+    by element through the generic resolver, capturing value, placeholder
+    flag, lookup provenance, and transfer depth per atomic number.
+    """
+
+    nested_policy, nested_owner = _coerce_nested_policy(source)
+    if nested_policy is None:
+        # Plain dataset: the packaged arrays are already dense by Z.
+        dataset = resolve_dataset_like(source)
+        placeholders = tuple(
+            False
+            if value is None
+            else _is_placeholder_value(dataset.info, float(value))
+            for value in dataset.values_by_z
+        )
+        # Every present value counts as a direct base lookup at depth 0;
+        # missing slots keep None so downstream filters can skip them.
+        lookup_sources = tuple(
+            "base" if value is not None else None for value in dataset.values_by_z
+        )
+        transfer_depths = tuple(
+            0 if value is not None else None for value in dataset.values_by_z
+        )
+        return _ResolvedElementSource(
+            ref=dataset.ref,
+            values_by_z=dataset.values_by_z,
+            placeholder_by_z=placeholders,
+            lookup_source_by_z=lookup_sources,
+            transfer_depth_by_z=transfer_depths,
+            via_policy=False,
+        )
+
+    # Policy-like source: run the cycle-checked resolver for every packaged
+    # element and record the per-Z outcome (index 0 of each array is unused).
+    target = _resolve_target_ref(nested_policy)
+    n_z = max(elem.z for elem in iter_elements())
+    values: list[float | None] = [None] * (n_z + 1)
+    placeholders: list[bool] = [False] * (n_z + 1)
+    lookup_sources: list[LookupSource | None] = [None] * (n_z + 1)
+    transfer_depths: list[int | None] = [None] * (n_z + 1)
+    for elem in iter_elements():
+        lookup = _lookup_value_with_owner(
+            elem.symbol,
+            policy=nested_policy,
+            owner=nested_owner,
+        )
+        values[elem.z] = lookup.value
+        # Provenance is recorded only where a value was actually resolved.
+        if lookup.value is not None:
+            placeholders[elem.z] = lookup.is_placeholder
+            lookup_sources[elem.z] = lookup.source
+            transfer_depths[elem.z] = lookup.transfer_depth
+    return _ResolvedElementSource(
+        ref=target,
+        values_by_z=tuple(values),
+        placeholder_by_z=tuple(placeholders),
+        lookup_source_by_z=tuple(lookup_sources),
+        transfer_depth_by_z=tuple(transfer_depths),
+        via_policy=True,
+    )
+
+
+def _lookup_transfer_source_value(
+    symbol: str,
+    source: DatasetLike | SupportsValuePolicy | ValuePolicy[str],
+) -> tuple[_TransferSourceValue | None, str | None]:
+    """Resolve one element value from a transfer source or nested policy.
+
+    Returns ``(value, None)`` on success and ``(None, note)`` on a miss,
+    where ``note`` is a human-readable explanation for diagnostics.
+    """
+
+    nested_policy, nested_owner = _coerce_nested_policy(source)
+    if nested_policy is None:
+        # Plain dataset source: a direct element lookup at depth 0.
+        source_set = resolve_dataset_like(source)
+        value = source_set.get(symbol)
+        if value is None:
+            return None, f"no value in {source_set.ref.set_id}"
+        value_f = float(value)
+        return (
+            _TransferSourceValue(
+                value=value_f,
+                ref=source_set.ref,
+                resolved_from=(source_set.ref,),
+                is_placeholder=_is_placeholder_value(source_set.info, value_f),
+                via_policy=False,
+                lookup_source="base",
+                notes=(),
+                transfer_depth=0,
+            ),
+            None,
+        )
+
+    # Policy-like source: delegate to the cycle-checked generic resolver.
+    lookup = _lookup_value_with_owner(
+        symbol,
+        policy=nested_policy,
+        owner=nested_owner,
+    )
+    if lookup.value is None:
+        # Surface the nested resolver's notes when it produced any.
+        if lookup.notes:
+            return (
+                None,
+                "policy source returned no value: " + "; ".join(lookup.notes),
+            )
+        return None, "policy source returned no value"
+
+    return (
+        _TransferSourceValue(
+            value=float(lookup.value),
+            ref=_resolve_target_ref(nested_policy),
+            resolved_from=lookup.resolved_from,
+            is_placeholder=lookup.is_placeholder,
+            via_policy=True,
+            lookup_source=lookup.source,
+            notes=lookup.notes,
+            transfer_depth=lookup.transfer_depth,
+        ),
+        None,
+    )
+
+
+def _transfer_source_is_allowed(
+ lookup_source: LookupSource | None,
+ transfer_depth: int | None,
+ *,
+ allowed_sources: tuple[str, ...],
+ max_depth: int,
+) -> bool:
+ """Return whether a nested predictor value may participate downstream."""
+
+ if lookup_source is None or transfer_depth is None:
+ return False
+ return lookup_source in allowed_sources and transfer_depth <= max_depth
+
+
+def _explain_rejected_transfer_source(
+ *,
+ source_role: str,
+ lookup_source: LookupSource | None,
+ transfer_depth: int | None,
+ allowed_sources: tuple[str, ...],
+ max_depth: int,
+) -> str:
+ """Return a human-readable explanation for a rejected nested source."""
+
+ if lookup_source is None or transfer_depth is None:
+ return f"{source_role} policy source did not return a usable value"
+ if lookup_source not in allowed_sources:
+ allowed = ", ".join(allowed_sources)
+ return (
+ f"{source_role} policy source resolved via {lookup_source}, which is "
+ f"excluded by {source_role}_sources=({allowed})"
+ )
+ return (
+ f"{source_role} policy source transfer depth {transfer_depth} exceeds "
+ f"allowed maximum {max_depth} ({source_role}_max_depth)"
+ )
+
+
+def _fit_linear_transfer(
+    base_set: ElementScalarSet,
+    predictor_source: _ResolvedElementSource,
+    *,
+    min_points: int,
+    exclude_placeholders: bool,
+    fit_sources: tuple[str, ...],
+    fit_max_depth: int,
+) -> LinearFit:
+    """Fit a one-predictor linear transfer model between two sources.
+
+    Overlapping elements (those with a value in both sources) form the fit
+    sample; predictor values are additionally filtered by provenance
+    (``fit_sources``/``fit_max_depth``) and, optionally, placeholder status.
+    An ordinary least-squares line is then fitted in closed form.
+
+    Raises :class:`PolicyError` when fewer than ``min_points`` pairs survive
+    filtering, or when the predictor has zero variance.
+    """
+
+    xs: list[float] = []
+    ys: list[float] = []
+    filtered_by_fit_restrictions = 0
+
+    # Iterate only over the Z range both dense arrays cover (index 0 unused).
+    n_z = min(len(base_set.values_by_z), len(predictor_source.values_by_z))
+    for z in range(1, n_z):
+        y = base_set.values_by_z[z]
+        x = predictor_source.values_by_z[z]
+        if y is None or x is None:
+            continue
+        # Drop predictor values whose provenance or depth is out of bounds;
+        # count them so the error message can point at this filter.
+        if not _transfer_source_is_allowed(
+            predictor_source.lookup_source_by_z[z],
+            predictor_source.transfer_depth_by_z[z],
+            allowed_sources=fit_sources,
+            max_depth=fit_max_depth,
+        ):
+            filtered_by_fit_restrictions += 1
+            continue
+        y_f = float(y)
+        x_f = float(x)
+        # Placeholder values on either side are excluded when requested.
+        if exclude_placeholders and (
+            _is_placeholder_value(base_set.info, y_f)
+            or predictor_source.placeholder_by_z[z]
+        ):
+            continue
+        xs.append(x_f)
+        ys.append(y_f)
+
+    n = len(xs)
+    if n < min_points:
+        # Mention the fit-source filter only when it actually removed points.
+        if predictor_source.via_policy and filtered_by_fit_restrictions > 0:
+            raise PolicyError(
+                "not enough overlapping elements to fit linear transfer after "
+                "applying fit source constraints (fit-source restrictions)"
+            )
+        raise PolicyError("not enough overlapping elements to fit linear transfer")
+
+    # Closed-form simple linear regression: slope = Sxy / Sxx.
+    x_mean = sum(xs) / n
+    y_mean = sum(ys) / n
+    sxx = sum((x - x_mean) ** 2 for x in xs)
+    if sxx == 0:
+        raise PolicyError("cannot fit linear transfer: zero predictor variance")
+
+    sxy = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, ys))
+    slope = sxy / sxx
+    intercept = y_mean - slope * x_mean
+
+    # Goodness-of-fit diagnostics; a constant response (sst == 0) reports
+    # a perfect r2 by convention.
+    y_hat = [slope * x + intercept for x in xs]
+    sse = sum((y - yh) ** 2 for y, yh in zip(ys, y_hat))
+    sst = sum((y - y_mean) ** 2 for y in ys)
+    r2 = 1.0 - sse / sst if sst != 0 else 1.0
+    rmse = math.sqrt(sse / n)
+
+    return LinearFit(
+        coefficients=(slope,),
+        intercept=intercept,
+        n_points=n,
+        r2=r2,
+        rmse=rmse,
+    )
+
+
+@lru_cache(maxsize=None)
+def _fit_linear_transfer_cached(
+ base_ref: DatasetRef,
+ predictor_ref: DatasetRef,
+ min_points: int,
+ exclude_placeholders: bool,
+ fit_sources: tuple[str, ...],
+ fit_max_depth: int,
+) -> LinearFit:
+ """Cache fits between two packaged datasets for repeated reuse."""
+
+ return _fit_linear_transfer(
+ get_builtin_set(base_ref),
+ _materialize_transfer_source(predictor_ref),
+ min_points=min_points,
+ exclude_placeholders=exclude_placeholders,
+ fit_sources=fit_sources,
+ fit_max_depth=fit_max_depth,
+ )
+
+
+def _fit_transfer_model(base: DatasetLike, transfer: TransferModel) -> LinearFit | None:
+ """Return the fit object for a transfer model when it needs one."""
+
+ if not isinstance(transfer, LinearTransfer):
+ return None
+ if len(transfer.predictors) != 1:
+ raise PolicyError(
+ "LinearTransfer currently supports exactly one predictor source"
+ )
+
+ predictor = transfer.predictors[0]
+ if isinstance(base, DatasetRef) and isinstance(predictor, DatasetRef):
+ return _fit_linear_transfer_cached(
+ base,
+ predictor,
+ transfer.min_points,
+ transfer.exclude_placeholders,
+ transfer.fit_sources,
+ transfer.fit_max_depth,
+ )
+ return _fit_linear_transfer(
+ resolve_dataset_like(base),
+ _materialize_transfer_source(predictor),
+ min_points=transfer.min_points,
+ exclude_placeholders=transfer.exclude_placeholders,
+ fit_sources=transfer.fit_sources,
+ fit_max_depth=transfer.fit_max_depth,
+ )
+
+
+def _apply_substitution_transfer(
+    symbol: str,
+    *,
+    target: DatasetRef,
+    transfer: SubstitutionTransfer,
+) -> tuple[LookupResult | None, str | None]:
+    """Try to resolve ``symbol`` by direct substitution from another source.
+
+    Returns ``(result, None)`` on success or ``(None, note)`` when the
+    source has no usable value.  Substitution increments the transfer depth
+    of the source value by one.
+    """
+
+    source_value, note = _lookup_transfer_source_value(symbol, transfer.source)
+    if source_value is None:
+        return None, note
+
+    # Record where the substituted value came from and any caveats.
+    notes = [
+        "missing in base set; substituted from policy source"
+        if source_value.via_policy
+        else "missing in base set; substituted from transfer source"
+    ]
+    if source_value.via_policy and source_value.lookup_source not in (None, "base"):
+        notes.append(
+            f"policy source resolved the value via {source_value.lookup_source}"
+        )
+    if source_value.is_placeholder:
+        notes.append("transfer source value is marked as a placeholder")
+    return (
+        LookupResult(
+            value=source_value.value,
+            source="transfer_substitution",
+            target=target,
+            resolved_from=source_value.resolved_from,
+            is_placeholder=source_value.is_placeholder,
+            notes=tuple(notes),
+            transfer_depth=source_value.transfer_depth + 1,
+        ),
+        None,
+    )
+
+
+def _apply_linear_transfer(
+    symbol: str,
+    *,
+    base: DatasetLike,
+    target: DatasetRef,
+    transfer: LinearTransfer,
+) -> tuple[LookupResult | None, str | None]:
+    """Try to resolve ``symbol`` through linear transfer from predictor data.
+
+    Returns ``(result, None)`` on success or ``(None, note)`` with a
+    diagnostic explanation when the predictor value is missing, rejected by
+    the prediction source/depth filters, or excluded as a placeholder.
+    """
+
+    if len(transfer.predictors) != 1:
+        raise PolicyError(
+            "LinearTransfer currently supports exactly one predictor source"
+        )
+
+    predictor_value, note = _lookup_transfer_source_value(
+        symbol,
+        transfer.predictors[0],
+    )
+    if predictor_value is None:
+        return None, note
+
+    # Prediction-time provenance gate (separate from fit-time filtering).
+    if not _transfer_source_is_allowed(
+        predictor_value.lookup_source,
+        predictor_value.transfer_depth,
+        allowed_sources=transfer.prediction_sources,
+        max_depth=transfer.prediction_max_depth,
+    ):
+        return (
+            None,
+            _explain_rejected_transfer_source(
+                source_role="prediction",
+                lookup_source=predictor_value.lookup_source,
+                transfer_depth=predictor_value.transfer_depth,
+                allowed_sources=transfer.prediction_sources,
+                max_depth=transfer.prediction_max_depth,
+            ),
+        )
+
+    if transfer.exclude_placeholders and predictor_value.is_placeholder:
+        if predictor_value.via_policy:
+            return None, "predictor value from policy source is a placeholder"
+        return None, f"predictor value in {predictor_value.ref.set_id} is a placeholder"
+
+    # Fit (or fetch the cached fit for) the one-predictor linear model.
+    fit = _fit_transfer_model(base, transfer)
+    if fit is None:
+        return None, "no fit available for linear transfer"
+    predicted = fit.coefficients[0] * predictor_value.value + fit.intercept
+
+    notes = ["missing in base set; inferred via linear transfer"]
+    if predictor_value.via_policy:
+        notes.append("predictor value supplied by policy source")
+        notes.append(
+            "linear fit applied fit-source and transfer-depth limits to "
+            "policy-materialized predictor values"
+        )
+        if predictor_value.lookup_source not in (None, "base"):
+            notes.append(
+                "policy predictor resolved the value via "
+                f"{predictor_value.lookup_source}"
+            )
+
+    return (
+        LookupResult(
+            value=float(predicted),
+            source="transfer_linear",
+            target=target,
+            resolved_from=predictor_value.resolved_from,
+            # A modeled value is never a placeholder, even if inputs were.
+            is_placeholder=False,
+            fit=fit,
+            notes=tuple(notes),
+            transfer_depth=predictor_value.transfer_depth + 1,
+        ),
+        None,
+    )
+
+
+def _resolve_value(
+    symbol: str | None,
+    *,
+    policy: ValuePolicy[str],
+    resolution_owner: object | None = None,
+) -> LookupResult:
+    """Resolve a value through override, base, transfer, and fallback steps.
+
+    Resolution order: symbol normalization, blocked list, overrides, base
+    dataset, transfer models (in policy order), then fallback; each stage
+    short-circuits on success.  A context-local token stack guards against
+    cyclic policy resolution (a policy that, directly or through a wrapper,
+    feeds into itself) and raises :class:`PolicyError` on a cycle.
+    """
+
+    # Cycle check: refuse to re-enter a policy (or wrapper) already on the
+    # context-local resolution stack.
+    active_tokens = _ACTIVE_POLICY_TOKENS.get()
+    resolution_tokens = _policy_resolution_tokens(policy, owner=resolution_owner)
+    if any(token in active_tokens for token in resolution_tokens):
+        raise PolicyError("cyclic policy resolution detected")
+
+    # Push our tokens; the finally block below always pops them, even when
+    # a branch inside the try returns early or raises.
+    stack_token = _ACTIVE_POLICY_TOKENS.set(active_tokens + resolution_tokens)
+    try:
+        target = _resolve_target_ref(policy)
+        base_set = resolve_dataset_like(policy.base)
+        if base_set.info.domain != "element":
+            raise PolicyError(
+                "the resolver currently supports only element-domain datasets"
+            )
+
+        sym = _normalize_element_symbol(symbol)
+        if sym is None:
+            note = "unknown element" if symbol is not None else "missing element symbol"
+            return LookupResult(
+                value=None,
+                source="missing",
+                target=target,
+                notes=(note,),
+            )
+
+        # Blocked symbols resolve to nothing regardless of other stages.
+        if sym in policy.blocked:
+            return LookupResult(
+                value=None,
+                source="missing",
+                target=target,
+                notes=(f"{sym} is blocked by this policy",),
+            )
+
+        # Explicit overrides win over everything else.
+        if sym in policy.overrides:
+            return LookupResult(
+                value=float(policy.overrides[sym]),
+                source="override",
+                target=target,
+                notes=("value supplied by policy override",),
+                transfer_depth=0,
+            )
+
+        base_value = base_set.get(sym)
+        if base_value is not None:
+            base_f = float(base_value)
+            is_placeholder = _is_placeholder_value(base_set.info, base_f)
+            notes = (
+                ("base dataset value is marked as a placeholder",)
+                if is_placeholder
+                else ()
+            )
+            return LookupResult(
+                value=base_f,
+                source="base",
+                target=target,
+                resolved_from=(base_set.ref,),
+                is_placeholder=is_placeholder,
+                notes=notes,
+                transfer_depth=0,
+            )
+
+        # Base set misses: try each transfer model in declared order,
+        # accumulating per-model failure notes for the final diagnostic.
+        transfer_notes: list[str] = ["missing in base set"]
+        for transfer in policy.transfers:
+            if isinstance(transfer, SubstitutionTransfer):
+                result, note = _apply_substitution_transfer(
+                    sym,
+                    target=target,
+                    transfer=transfer,
+                )
+            elif isinstance(transfer, LinearTransfer):
+                result, note = _apply_linear_transfer(
+                    sym,
+                    base=policy.base,
+                    target=target,
+                    transfer=transfer,
+                )
+            else:  # pragma: no cover - closed union today
+                raise PolicyError(f"unsupported transfer model: {type(transfer)!r}")
+
+            if result is not None:
+                return result
+            if note:
+                transfer_notes.append(note)
+
+        if policy.fallback is not None:
+            return LookupResult(
+                value=float(policy.fallback),
+                source="fallback",
+                target=target,
+                notes=tuple(transfer_notes + ["using fallback value"]),
+                transfer_depth=0,
+            )
+
+        return LookupResult(
+            value=None,
+            source="missing",
+            target=target,
+            notes=tuple(transfer_notes),
+        )
+    finally:
+        # Always unwind the cycle-detection stack for this resolution.
+        _ACTIVE_POLICY_TOKENS.reset(stack_token)
+
+
+def _lookup_value_from_policy_source(
+ symbol: str | None,
+ *,
+ source: ValuePolicy[str] | SupportsValuePolicy,
+) -> LookupResult:
+ """Resolve a value from either a generic policy or a wrapper policy."""
+
+ if isinstance(source, ValuePolicy):
+ return _lookup_value_with_owner(symbol, policy=source, owner=None)
+ policy = source.as_value_policy()
+ return _lookup_value_with_owner(symbol, policy=policy, owner=source)
+
+
+def _get_value_from_policy_source(
+ symbol: str | None,
+ *,
+ source: ValuePolicy[str] | SupportsValuePolicy,
+) -> float | None:
+ """Return only the scalar selected by a generic or wrapper policy."""
+
+ return _lookup_value_from_policy_source(symbol, source=source).value
+
+
+def lookup_value(symbol: str | None, *, policy: ValuePolicy[str]) -> LookupResult:
+ """Public entry point for generic element-domain scalar lookup.
+
+ This is the same resolver used internally by the radii convenience layer.
+ In the current implementation the runtime supports only element-domain policies.
+ """
+
+ return _lookup_value_with_owner(symbol, policy=policy, owner=None)
+
+
+def get_value(symbol: str | None, *, policy: ValuePolicy[str]) -> float | None:
+ """Return only the resolved scalar value for an element-domain policy."""
+
+ return lookup_value(symbol, policy=policy).value
diff --git a/src/atomref/py.typed b/src/atomref/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/atomref/radii.py b/src/atomref/radii.py
new file mode 100644
index 0000000..b33877f
--- /dev/null
+++ b/src/atomref/radii.py
@@ -0,0 +1,363 @@
+"""Radii-specific public API built on the generic policy core."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
+import math
+from typing import Literal
+
+from .elements import canonicalize_element_symbol, get_element, is_valid_element_symbol
+from .errors import PolicyError
+from .policy import (
+ LookupResult,
+ ValuePolicy,
+ _fit_transfer_model,
+ _get_value_from_policy_source,
+ _lookup_value_from_policy_source,
+)
+from .registry import (
+ DatasetInfo,
+ DatasetRef,
+ ElementScalarSet,
+ get_builtin_set,
+ get_dataset_info,
+ list_dataset_ids,
+ list_dataset_infos,
+)
+from .transfer import LinearFit, LinearTransfer, SubstitutionTransfer, TransferModel
+
+# Public aliases for the radii convenience layer: the supported radii kinds
+# and the element-scalar container type they are stored in.
+RadiiKind = Literal["covalent", "van_der_waals"]
+RadiiSet = ElementScalarSet
+
+
+# Maps public radii kind names onto registry quantity identifiers.
+_KIND_TO_QUANTITY = {
+    "covalent": "covalent_radius",
+    "van_der_waals": "van_der_waals_radius",
+}
+
+
+@dataclass(frozen=True, slots=True)
+class RadiiPolicy:
+    """Policy wrapper specialized for radii lookup.
+
+    ``kind`` determines the target quantity, while the remaining fields mirror
+    the generic :class:`atomref.policy.ValuePolicy` interface.
+    """
+
+    # Which radii quantity this policy targets ("covalent"/"van_der_waals").
+    kind: RadiiKind
+    # Packaged set id, or an already-loaded element-scalar set.
+    base_set: str | RadiiSet
+    # Transfer models tried, in order, when the base set misses an element.
+    transfers: tuple[TransferModel, ...] = ()
+    # Per-symbol value overrides; validated as non-negative on conversion.
+    overrides: Mapping[str, float] = field(default_factory=dict)
+    # Last-resort value when all other stages fail; validated non-negative.
+    fallback: float | None = None
+
+    def as_value_policy(self) -> ValuePolicy[str]:
+        """Convert the radii policy into the generic scalar-value policy.
+
+        Raises :class:`PolicyError` when a preloaded ``base_set`` targets a
+        different quantity than ``kind``, or when an override/fallback value
+        is not a finite non-negative number.
+        """
+
+        quantity = _quantity_for_kind(self.kind)
+        if isinstance(self.base_set, ElementScalarSet):
+            # A preloaded set must already belong to the requested quantity.
+            if self.base_set.ref.quantity != quantity:
+                msg = (
+                    f"base_set quantity {self.base_set.ref.quantity!r} "
+                    f"is incompatible with radii kind {self.kind!r}"
+                )
+                raise PolicyError(msg)
+            base = self.base_set
+        else:
+            base = DatasetRef(quantity, self.base_set)
+
+        # Radii values are stricter than the generic policy: negative
+        # overrides/fallbacks are rejected here, before delegation.
+        checked_overrides = {
+            key: _coerce_non_negative_radii_value(
+                value,
+                what=f"radii override value for {key!r}",
+            )
+            for key, value in self.overrides.items()
+        }
+        checked_fallback = (
+            None
+            if self.fallback is None
+            else _coerce_non_negative_radii_value(
+                self.fallback,
+                what="radii fallback",
+            )
+        )
+
+        return ValuePolicy(
+            base=base,
+            transfers=self.transfers,
+            overrides=checked_overrides,
+            fallback=checked_fallback,
+        )
+
+
+@dataclass(frozen=True, slots=True)
+class RadiiElementAssessment:
+    """Per-element row in a radii policy assessment report."""
+
+    # Canonical element symbol this row describes.
+    symbol: str
+    # Full lookup result (value, source, provenance, notes) for the symbol.
+    lookup: LookupResult
+
+
+@dataclass(frozen=True, slots=True)
+class RadiiPolicyAssessment:
+    """Summary of how a radii policy behaved over a set of elements."""
+
+    # The radii kind and policy that were assessed, plus the normalized,
+    # deduplicated element symbols the assessment covered.
+    kind: RadiiKind
+    policy: RadiiPolicy
+    elements: tuple[str, ...]
+
+    # Counters: total elements, then one count per resolution source.
+    n_elements: int
+    n_override: int
+    n_base: int
+    n_transfer_substitution: int
+    n_transfer_linear: int
+    n_fallback: int
+    n_missing: int
+    n_placeholders: int
+
+    # Symbols that resolved to nothing / to placeholder-flagged values.
+    missing_symbols: tuple[str, ...]
+    placeholder_symbols: tuple[str, ...]
+
+    # Linear fits that could be computed, warnings from fits that failed,
+    # and (when requested) the per-element lookup detail rows.
+    fits: tuple[LinearFit, ...] = ()
+    warnings: tuple[str, ...] = ()
+    per_element: tuple[RadiiElementAssessment, ...] = ()
+
+
+def _coerce_non_negative_radii_value(value: object, *, what: str) -> float:
+ """Validate a radii-like policy number.
+
+ The generic :class:`atomref.policy.ValuePolicy` accepts any finite scalar.
+ Radii-specific convenience helpers are stricter and reject negative values.
+ """
+
+ try:
+ out = float(value)
+ except (TypeError, ValueError) as exc:
+ raise PolicyError(f"{what} must be a finite float") from exc
+ if not math.isfinite(out):
+ raise PolicyError(f"{what} must be a finite float")
+ if out < 0:
+ raise PolicyError(f"{what} must be non-negative")
+ return out
+
+
+def _quantity_for_kind(kind: RadiiKind) -> str:
+ """Translate public radii kind names into registry quantity ids."""
+
+ try:
+ return _KIND_TO_QUANTITY[kind]
+ except KeyError as exc:
+ raise PolicyError(f"unknown radii kind: {kind!r}") from exc
+
+
+def _normalize_radii_symbol(symbol: str | None) -> str | None:
+ """Normalize symbols accepted by the radii convenience layer."""
+
+ cand = canonicalize_element_symbol(symbol)
+ if cand in {"D", "T"}:
+ cand = "H"
+ return cand
+
+
+def _normalize_assessment_elements(elements: Iterable[str]) -> tuple[str, ...]:
+ """Normalize, validate, deduplicate, and sort assessment element labels."""
+
+ symbols: set[str] = set()
+ for token in elements:
+ sym = _normalize_radii_symbol(token)
+ if sym is None:
+ raise ValueError("missing element symbol")
+ if not is_valid_element_symbol(sym):
+ raise ValueError(f"invalid element symbol: {sym!r}")
+ symbols.add(sym)
+ return tuple(
+ sorted(symbols, key=lambda s: get_element(s).z if get_element(s) else 0)
+ )
+
+
+def list_radii_sets(
+ kind: RadiiKind,
+ *,
+ usage_role: str | None = None,
+) -> tuple[str, ...]:
+ """List packaged radii-set ids for one radii kind."""
+
+ return list_dataset_ids(_quantity_for_kind(kind), usage_role=usage_role)
+
+
+def list_radii_set_infos(
+ kind: RadiiKind,
+ *,
+ usage_role: str | None = None,
+) -> tuple[DatasetInfo, ...]:
+ """Return packaged metadata objects for radii sets of one kind."""
+
+ return list_dataset_infos(_quantity_for_kind(kind), usage_role=usage_role)
+
+
+def get_radii_set_info(kind: RadiiKind, set_id: str) -> DatasetInfo:
+ """Return metadata for one packaged radii set."""
+
+ return get_dataset_info(DatasetRef(_quantity_for_kind(kind), set_id))
+
+
+def get_radii_set(kind: RadiiKind, set_id: str) -> RadiiSet:
+ """Load one packaged radii set as an :class:`ElementScalarSet`."""
+
+ return get_builtin_set(DatasetRef(_quantity_for_kind(kind), set_id))
+
+
+def _validate_policy_kind(policy: RadiiPolicy, *, expected: RadiiKind) -> None:
+ """Raise when a policy is used with the wrong public radii helper."""
+
+ if policy.kind != expected:
+ raise PolicyError(f"expected a {expected!r} radii policy, got {policy.kind!r}")
+
+
+def _lookup_radius(symbol: str | None, *, policy: RadiiPolicy) -> LookupResult:
+ """Shared implementation for radii lookup helpers."""
+
+ return _lookup_value_from_policy_source(symbol, source=policy)
+
+
+def lookup_covalent_radius(
+ symbol: str | None,
+ *,
+ policy: RadiiPolicy | None = None,
+) -> LookupResult:
+ """Resolve a covalent radius together with provenance information."""
+
+ active = DEFAULT_COVALENT_POLICY if policy is None else policy
+ _validate_policy_kind(active, expected="covalent")
+ return _lookup_radius(symbol, policy=active)
+
+
+def get_covalent_radius(
+ symbol: str | None,
+ *,
+ policy: RadiiPolicy | None = None,
+) -> float | None:
+ """Return only the selected covalent-radius value, without provenance."""
+
+ active = DEFAULT_COVALENT_POLICY if policy is None else policy
+ _validate_policy_kind(active, expected="covalent")
+ return _get_value_from_policy_source(symbol, source=active)
+
+
+def lookup_vdw_radius(
+ symbol: str | None,
+ *,
+ policy: RadiiPolicy | None = None,
+) -> LookupResult:
+ """Resolve a van der Waals radius together with provenance information."""
+
+ active = DEFAULT_VDW_POLICY if policy is None else policy
+ _validate_policy_kind(active, expected="van_der_waals")
+ return _lookup_radius(symbol, policy=active)
+
+
+def get_vdw_radius(
+ symbol: str | None,
+ *,
+ policy: RadiiPolicy | None = None,
+) -> float | None:
+ """Return only the selected van der Waals-radius value, without provenance."""
+
+ active = DEFAULT_VDW_POLICY if policy is None else policy
+ _validate_policy_kind(active, expected="van_der_waals")
+ return _get_value_from_policy_source(symbol, source=active)
+
+
+def assess_radii_policy(
+    elements: Iterable[str],
+    *,
+    policy: RadiiPolicy,
+    detail: bool = False,
+) -> RadiiPolicyAssessment:
+    """Assess how a radii policy resolves values over a set of elements.
+
+    Every requested element is resolved once; the report tallies how many
+    resolutions came from each source, which symbols are missing or
+    placeholder-flagged, and which linear-transfer fits could be computed.
+    When ``detail`` is true, per-element lookup rows are included as well.
+    """
+
+    elems = _normalize_assessment_elements(elements)
+    value_policy = policy.as_value_policy()
+
+    # One counter per possible LookupResult.source value.
+    n_override = 0
+    n_base = 0
+    n_transfer_substitution = 0
+    n_transfer_linear = 0
+    n_fallback = 0
+    n_missing = 0
+    n_placeholders = 0
+
+    missing_symbols: list[str] = []
+    placeholder_symbols: list[str] = []
+    per_element: list[RadiiElementAssessment] = []
+
+    for symbol in elems:
+        lookup = _lookup_value_from_policy_source(symbol, source=policy)
+        if lookup.source == "override":
+            n_override += 1
+        elif lookup.source == "base":
+            n_base += 1
+        elif lookup.source == "transfer_substitution":
+            n_transfer_substitution += 1
+        elif lookup.source == "transfer_linear":
+            n_transfer_linear += 1
+        elif lookup.source == "fallback":
+            n_fallback += 1
+        elif lookup.source == "missing":
+            n_missing += 1
+            missing_symbols.append(symbol)
+
+        # Placeholder counting is independent of the resolution source.
+        if lookup.is_placeholder:
+            n_placeholders += 1
+            placeholder_symbols.append(symbol)
+
+        if detail:
+            per_element.append(RadiiElementAssessment(symbol=symbol, lookup=lookup))
+
+    # Fit each linear transfer up front; failures become warnings rather
+    # than aborting the assessment.
+    fits: list[LinearFit] = []
+    warnings: list[str] = []
+    for transfer in value_policy.transfers:
+        if isinstance(transfer, LinearTransfer):
+            try:
+                fit = _fit_transfer_model(value_policy.base, transfer)
+            except Exception as exc:  # noqa: BLE001
+                warnings.append(str(exc))
+            else:
+                if fit is not None:
+                    fits.append(fit)
+
+    return RadiiPolicyAssessment(
+        kind=policy.kind,
+        policy=policy,
+        elements=elems,
+        n_elements=len(elems),
+        n_override=n_override,
+        n_base=n_base,
+        n_transfer_substitution=n_transfer_substitution,
+        n_transfer_linear=n_transfer_linear,
+        n_fallback=n_fallback,
+        n_missing=n_missing,
+        n_placeholders=n_placeholders,
+        missing_symbols=tuple(missing_symbols),
+        placeholder_symbols=tuple(placeholder_symbols),
+        fits=tuple(fits),
+        warnings=tuple(warnings),
+        per_element=tuple(per_element),
+    )
+
+
+# Default: values from the "cordero2008" set, with elements missing there
+# substituted directly from the "csd_legacy_cov" set.
+DEFAULT_COVALENT_POLICY = RadiiPolicy(
+    kind="covalent",
+    base_set="cordero2008",
+    transfers=(
+        SubstitutionTransfer(source=DatasetRef("covalent_radius", "csd_legacy_cov")),
+    ),
+)
+"""Default covalent-radii policy used by the convenience helpers."""
+
+# Default: values from the "alvarez2013" set, with missing elements inferred
+# via a linear transfer fitted against the "rahm2016" atomic radii.
+DEFAULT_VDW_POLICY = RadiiPolicy(
+    kind="van_der_waals",
+    base_set="alvarez2013",
+    transfers=(LinearTransfer(predictors=(DatasetRef("atomic_radius", "rahm2016"),)),),
+)
+"""Default vdW-radii policy used by the convenience helpers."""
diff --git a/src/atomref/registry.py b/src/atomref/registry.py
new file mode 100644
index 0000000..b17b941
--- /dev/null
+++ b/src/atomref/registry.py
@@ -0,0 +1,609 @@
+"""Dataset registry and packaged element-scalar set loading."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass
+import csv
+from functools import lru_cache
+from importlib import resources
+import json
+import math
+from types import MappingProxyType
+import unicodedata
+
+from .elements import canonicalize_element_symbol, get_element, iter_elements
+from .errors import DatasetError
+
+# Semantic aliases for plain strings used throughout the registry API.
+QuantityId = str  # operational property family, e.g. "covalent_radius"
+DomainId = str  # lookup-key domain of a quantity, e.g. "element"
+
+
+@dataclass(frozen=True, slots=True)
+class DatasetRef:
+    """Stable reference to a packaged dataset.
+
+    The ``quantity`` identifies the operational property family, while
+    ``set_id`` names a specific curated dataset within that family.
+    """
+
+    quantity: QuantityId  # must name a key of the registry's quantities table
+    set_id: str  # canonical id or an alias (aliases resolve via _resolve_set_id)
+
+
+@dataclass(frozen=True, slots=True)
+class Reference:
+    """Bibliographic record attached to packaged dataset metadata.
+
+    Every field is optional; entries absent from the registry are ``None``.
+    """
+
+    authors: str | None = None
+    year: int | None = None
+    title: str | None = None
+    venue: str | None = None
+    doi: str | None = None
+    url: str | None = None
+    publisher: str | None = None
+    note: str | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class CoverageInfo:
+    """Coverage summary for an element-indexed scalar dataset."""
+
+    n_values: int  # number of elements with a stored value
+    z_min: int | None = None  # smallest covered atomic number, if any
+    z_max: int | None = None  # largest covered atomic number, if any
+    has_placeholders: bool = False  # True when placeholder sentinel values occur
+    covered_z: tuple[int, ...] = ()  # atomic numbers that carry a value
+    missing_z: tuple[int, ...] = ()  # atomic numbers without a value
+
+
+@dataclass(frozen=True, slots=True)
+class QuantityInfo:
+    """Metadata shared by all datasets that belong to one quantity."""
+
+    quantity: QuantityId
+    domain: DomainId  # e.g. "element"; the lookup-key domain
+    units: str | None = None
+    description: str | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class DatasetInfo:
+    """Curated metadata for one packaged dataset.
+
+    This object keeps operational classification such as ``ref.quantity`` and
+    ``usage_role`` separate from scientific classification such as
+    ``semantic_class`` and ``phase_context``.
+    """
+
+    ref: DatasetRef
+    domain: DomainId
+    units: str | None
+    name: str
+    description: str | None = None
+    usage_role: str | None = None  # operational role, e.g. "target" or "support"
+    semantic_class: str | None = None
+    origin_class: str | None = None
+    phase_context: str | None = None
+    method_summary: str | None = None
+    placeholder_value: float | None = None  # values equal to this are treated as placeholders (see _is_placeholder_value)
+    extraction_source: str | None = None
+    aliases: tuple[str, ...] = ()
+    references: tuple[Reference, ...] = ()
+    notes: tuple[str, ...] = ()
+    storage: Mapping[str, object] | None = None  # raw file/column storage metadata
+    coverage: CoverageInfo | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class ElementScalarSet:
+    """Element-indexed scalar dataset stored densely by atomic number."""
+
+    ref: DatasetRef  # identity of this dataset (quantity + set id)
+    info: DatasetInfo  # curated metadata; ``info.ref`` mirrors ``ref``
+    # Index 0 is unused padding so values_by_z[z] works for 1-based atomic numbers.
+    values_by_z: tuple[float | None, ...]
+
+    @classmethod
+    def from_mapping(
+        cls,
+        *,
+        ref: DatasetRef,
+        values: Mapping[str, float | None],
+        name: str,
+        units: str | None,
+        description: str | None = None,
+        usage_role: str = "user",
+        semantic_class: str = "user",
+        origin_class: str = "user",
+        phase_context: str | None = None,
+        references: Iterable[Reference] = (),
+        notes: Iterable[str] = (),
+        placeholder_value: float | None = None,
+    ) -> "ElementScalarSet":
+        """Build a custom element-domain dataset from a symbol-keyed mapping.
+
+        Raises :class:`DatasetError` for invalid element symbols, for two
+        distinct keys that normalize to the same element, and for non-finite
+        numeric values.
+        """
+
+        # Dense storage sized by the largest packaged atomic number; slot 0 unused.
+        n_z = max(e.z for e in iter_elements())
+        values_by_z: list[float | None] = [None] * (n_z + 1)
+        seen_keys: dict[str, str] = {}
+
+        placeholder_f = (
+            None
+            if placeholder_value is None
+            else _coerce_finite_float(
+                placeholder_value,
+                what=f"placeholder value for custom dataset {ref.set_id!r}",
+            )
+        )
+
+        for key, value in values.items():
+            sym = _normalize_element_domain_symbol(key)
+            elem = get_element(sym)
+            if elem is None:
+                raise DatasetError(f"invalid element symbol in custom set: {key!r}")
+            # Reject two different spellings that collapse onto one element
+            # (normalization folds case and maps the D/T isotopes onto H).
+            previous = seen_keys.get(sym)
+            if previous is not None and previous != key:
+                raise DatasetError(
+                    "custom-set keys "
+                    f"{previous!r} and {key!r} both normalize to {sym!r}"
+                )
+            seen_keys[sym] = key
+            values_by_z[elem.z] = (
+                None
+                if value is None
+                else _coerce_finite_float(
+                    value,
+                    what=f"value for element {key!r} in custom dataset {ref.set_id!r}",
+                )
+            )
+
+        covered_z = tuple(
+            z for z, value in enumerate(values_by_z) if z > 0 and value is not None
+        )
+        has_placeholders = False
+        if placeholder_f is not None:
+            # Tolerance comparison: stored values equal to the placeholder
+            # (within 1e-12) mark gaps rather than real data.
+            has_placeholders = any(
+                value is not None and abs(value - placeholder_f) < 1e-12
+                for value in values_by_z[1:]
+            )
+
+        info = DatasetInfo(
+            ref=ref,
+            domain="element",
+            units=units,
+            name=name,
+            description=description,
+            usage_role=usage_role,
+            semantic_class=semantic_class,
+            origin_class=origin_class,
+            phase_context=phase_context,
+            placeholder_value=placeholder_f,
+            aliases=(),
+            references=tuple(references),
+            notes=tuple(notes),
+            storage=None,
+            coverage=CoverageInfo(
+                n_values=len(covered_z),
+                z_min=min(covered_z) if covered_z else None,
+                z_max=max(covered_z) if covered_z else None,
+                has_placeholders=has_placeholders,
+                covered_z=covered_z,
+                missing_z=tuple(z for z in range(1, n_z + 1) if values_by_z[z] is None),
+            ),
+        )
+        return cls(ref=ref, info=info, values_by_z=tuple(values_by_z))
+
+    def get(self, symbol: str | None) -> float | None:
+        """Return the scalar value for ``symbol`` or ``None`` if absent."""
+
+        sym = _normalize_element_domain_symbol(symbol)
+        elem = get_element(sym)
+        if elem is None:
+            return None
+        return self.values_by_z[elem.z]
+
+
+DatasetLike = DatasetRef | ElementScalarSet  # anything resolvable to a loaded set
+
+
+# Translation table folding common Unicode hyphen/dash/minus code points onto
+# ASCII "-"; used by _canonicalize_alias_token for typography-tolerant matching.
+_DASH_TRANSLATION = str.maketrans(
+    {
+        "‐": "-",
+        "‑": "-",
+        "‒": "-",
+        "–": "-",
+        "—": "-",
+        "―": "-",
+        "−": "-",
+    }
+)
+
+
+def _normalize_element_domain_symbol(symbol: str | None) -> str | None:
+ """Normalize element-domain symbols and fold D/T onto hydrogen."""
+
+ cand = canonicalize_element_symbol(symbol)
+ if cand in {"D", "T"}:
+ return "H"
+ return cand
+
+
+@lru_cache(maxsize=1)
+def _load_registry_json() -> dict[str, object]:
+ """Load the packaged registry JSON as a validated top-level mapping."""
+
+ path = resources.files("atomref.data").joinpath("registry.json")
+ with path.open("r", encoding="utf-8") as handle:
+ data = json.load(handle)
+ if not isinstance(data, dict):
+ raise DatasetError("invalid registry.json: expected JSON object")
+ return data
+
+
+def _freeze_json_like(value: object) -> object:
+ """Recursively freeze JSON-like metadata structures.
+
+ Registry metadata is cached globally. Returning raw dicts or lists from that
+ cache would let callers mutate shared package state through the metadata
+ objects returned by :func:`get_dataset_info`.
+ """
+
+ if isinstance(value, dict):
+ frozen = {str(key): _freeze_json_like(item) for key, item in value.items()}
+ return MappingProxyType(frozen)
+ if isinstance(value, list):
+ return tuple(_freeze_json_like(item) for item in value)
+ return value
+
+
+def _coerce_finite_float(value: object, *, what: str) -> float:
+ """Return ``value`` as a finite float or raise :class:`DatasetError`."""
+
+ try:
+ out = float(value)
+ except (TypeError, ValueError) as exc:
+ raise DatasetError(f"{what} must be a finite float") from exc
+ if not math.isfinite(out):
+ raise DatasetError(f"{what} must be a finite float")
+ return out
+
+
+def _get_quantities_mapping() -> Mapping[str, object]:
+ """Return the raw ``quantities`` mapping from ``registry.json``."""
+
+ quantities = _load_registry_json().get("quantities")
+ if not isinstance(quantities, dict):
+ raise DatasetError("invalid registry.json: missing quantities mapping")
+ return quantities
+
+
+def _get_datasets_mapping() -> Mapping[str, object]:
+ """Return the raw ``datasets`` mapping from ``registry.json``."""
+
+ datasets = _load_registry_json().get("datasets")
+ if not isinstance(datasets, dict):
+ raise DatasetError("invalid registry.json: missing datasets mapping")
+ return datasets
+
+
+def _datasets_for_quantity(quantity: QuantityId) -> Mapping[str, object]:
+ """Return the dataset table for one quantity or raise on unknown input."""
+
+ datasets = _get_datasets_mapping().get(quantity)
+ if not isinstance(datasets, dict):
+ raise DatasetError(f"unknown quantity: {quantity!r}")
+ return datasets
+
+
+def list_quantities() -> tuple[str, ...]:
+ """List packaged quantity identifiers in registry order."""
+
+ return tuple(_get_quantities_mapping().keys())
+
+
+def get_quantity_info(quantity: QuantityId) -> QuantityInfo:
+ """Return quantity-level metadata for a packaged quantity."""
+
+ raw = _get_quantities_mapping().get(quantity)
+ if not isinstance(raw, dict):
+ raise DatasetError(f"unknown quantity: {quantity!r}")
+ domain = raw.get("domain") if isinstance(raw.get("domain"), str) else None
+ if domain is None:
+ raise DatasetError(f"missing domain for quantity: {quantity!r}")
+ units = raw.get("units") if isinstance(raw.get("units"), str) else None
+ description = (
+ raw.get("description") if isinstance(raw.get("description"), str) else None
+ )
+ return QuantityInfo(
+ quantity=quantity,
+ domain=domain,
+ units=units,
+ description=description,
+ )
+
+
+def _canonicalize_alias_token(value: str) -> str:
+ """Normalize a dataset id or alias for case-insensitive comparison."""
+
+ normalized = unicodedata.normalize("NFKC", value)
+ normalized = normalized.translate(_DASH_TRANSLATION)
+ return " ".join(normalized.strip().lower().split())
+
+
+def _resolve_set_id(quantity: QuantityId, set_id: str) -> str:
+ """Resolve a dataset id or alias to its canonical packaged set id."""
+
+ by_quantity = _datasets_for_quantity(quantity)
+ if set_id in by_quantity:
+ return set_id
+
+ wanted = _canonicalize_alias_token(set_id)
+ for actual_id, raw_entry in by_quantity.items():
+ if _canonicalize_alias_token(actual_id) == wanted:
+ return actual_id
+ if isinstance(raw_entry, dict):
+ aliases = raw_entry.get("aliases", ())
+ if isinstance(aliases, list):
+ for alias in aliases:
+ if (
+ isinstance(alias, str)
+ and _canonicalize_alias_token(alias) == wanted
+ ):
+ return actual_id
+ raise DatasetError(f"unknown dataset id for {quantity!r}: {set_id!r}")
+
+
+def list_dataset_ids(
+ quantity: QuantityId, *, usage_role: str | None = None
+) -> tuple[str, ...]:
+ """List packaged dataset identifiers for a quantity.
+
+ When ``usage_role`` is provided, only datasets with a matching normalized
+ role such as ``"target"`` or ``"support"`` are returned.
+ """
+
+ dataset_ids = tuple(_datasets_for_quantity(quantity).keys())
+ if usage_role is None:
+ return dataset_ids
+
+ filtered: list[str] = []
+ wanted = usage_role.strip().lower()
+ for set_id in dataset_ids:
+ info = get_dataset_info(DatasetRef(quantity, set_id))
+ role = (info.usage_role or "").strip().lower()
+ if role == wanted:
+ filtered.append(set_id)
+ return tuple(filtered)
+
+
+def list_dataset_infos(
+ quantity: QuantityId, *, usage_role: str | None = None
+) -> tuple[DatasetInfo, ...]:
+ """Return packaged dataset metadata objects for a quantity."""
+
+ return tuple(
+ get_dataset_info(DatasetRef(quantity, set_id))
+ for set_id in list_dataset_ids(quantity, usage_role=usage_role)
+ )
+
+
+def _coerce_reference(obj: object) -> Reference:
+ """Coerce a raw registry reference entry into :class:`Reference`."""
+
+ if not isinstance(obj, dict):
+ raise DatasetError("invalid reference entry in registry.json")
+ return Reference(
+ authors=obj.get("authors") if isinstance(obj.get("authors"), str) else None,
+ year=obj.get("year") if isinstance(obj.get("year"), int) else None,
+ title=obj.get("title") if isinstance(obj.get("title"), str) else None,
+ venue=obj.get("venue") if isinstance(obj.get("venue"), str) else None,
+ doi=obj.get("doi") if isinstance(obj.get("doi"), str) else None,
+ url=obj.get("url") if isinstance(obj.get("url"), str) else None,
+ publisher=(
+ obj.get("publisher") if isinstance(obj.get("publisher"), str) else None
+ ),
+ note=obj.get("note") if isinstance(obj.get("note"), str) else None,
+ )
+
+
+def _coerce_coverage(obj: object) -> CoverageInfo | None:
+ """Coerce raw coverage metadata into :class:`CoverageInfo`."""
+
+ if not isinstance(obj, dict):
+ return None
+ covered = obj.get("covered_z")
+ missing = obj.get("missing_z")
+ covered_z = tuple(int(z) for z in covered) if isinstance(covered, list) else ()
+ missing_z = tuple(int(z) for z in missing) if isinstance(missing, list) else ()
+ return CoverageInfo(
+ n_values=int(obj["n_values"]),
+ z_min=int(obj["z_min"]) if isinstance(obj.get("z_min"), int) else None,
+ z_max=int(obj["z_max"]) if isinstance(obj.get("z_max"), int) else None,
+ has_placeholders=bool(obj.get("has_placeholders", False)),
+ covered_z=covered_z,
+ missing_z=missing_z,
+ )
+
+
+def get_dataset_info(ref: DatasetRef) -> DatasetInfo:
+    """Return curated metadata for a packaged dataset reference."""
+
+    # Aliases are accepted on input; normalize to the canonical set id first.
+    actual_set_id = _resolve_set_id(ref.quantity, ref.set_id)
+    actual_ref = DatasetRef(quantity=ref.quantity, set_id=actual_set_id)
+
+    quantities = _get_quantities_mapping()
+    quantity_info = quantities.get(actual_ref.quantity)
+    if not isinstance(quantity_info, dict):
+        raise DatasetError(f"unknown quantity: {actual_ref.quantity!r}")
+
+    # Units and domain are quantity-level metadata shared by all of the
+    # quantity's datasets; only domain is mandatory.
+    units = (
+        quantity_info.get("units")
+        if isinstance(quantity_info.get("units"), str)
+        else None
+    )
+    domain = (
+        quantity_info.get("domain")
+        if isinstance(quantity_info.get("domain"), str)
+        else None
+    )
+    if domain is None:
+        raise DatasetError(f"missing domain for quantity: {actual_ref.quantity!r}")
+
+    raw_entry = _datasets_for_quantity(actual_ref.quantity).get(actual_ref.set_id)
+    if not isinstance(raw_entry, dict):
+        raise DatasetError(f"unknown dataset: {actual_ref}")
+
+    # Optional list-valued fields degrade to () when they are not lists.
+    refs_raw = raw_entry.get("references", [])
+    references = (
+        tuple(_coerce_reference(item) for item in refs_raw)
+        if isinstance(refs_raw, list)
+        else ()
+    )
+    aliases_raw = raw_entry.get("aliases", [])
+    aliases = (
+        tuple(item for item in aliases_raw if isinstance(item, str))
+        if isinstance(aliases_raw, list)
+        else ()
+    )
+    notes_raw = raw_entry.get("notes", [])
+    notes = (
+        tuple(item for item in notes_raw if isinstance(item, str))
+        if isinstance(notes_raw, list)
+        else ()
+    )
+    # Freeze storage so the globally cached metadata cannot be mutated by callers.
+    storage = (
+        _freeze_json_like(raw_entry.get("storage"))
+        if isinstance(raw_entry.get("storage"), dict)
+        else None
+    )
+
+    return DatasetInfo(
+        ref=actual_ref,
+        domain=domain,
+        units=units,
+        name=(
+            raw_entry.get("name")
+            if isinstance(raw_entry.get("name"), str)
+            else actual_ref.set_id
+        ),
+        description=(
+            raw_entry.get("description")
+            if isinstance(raw_entry.get("description"), str)
+            else None
+        ),
+        usage_role=(
+            raw_entry.get("usage_role")
+            if isinstance(raw_entry.get("usage_role"), str)
+            else None
+        ),
+        semantic_class=(
+            raw_entry.get("semantic_class")
+            if isinstance(raw_entry.get("semantic_class"), str)
+            else None
+        ),
+        origin_class=(
+            raw_entry.get("origin_class")
+            if isinstance(raw_entry.get("origin_class"), str)
+            else None
+        ),
+        phase_context=(
+            raw_entry.get("phase_context")
+            if isinstance(raw_entry.get("phase_context"), str)
+            else None
+        ),
+        method_summary=(
+            raw_entry.get("method_summary")
+            if isinstance(raw_entry.get("method_summary"), str)
+            else None
+        ),
+        placeholder_value=(
+            # A present-but-invalid placeholder is a hard error, not None.
+            _coerce_finite_float(
+                raw_entry["placeholder_value"],
+                what=f"placeholder value for packaged dataset {actual_ref!r}",
+            )
+            if raw_entry.get("placeholder_value") is not None
+            else None
+        ),
+        extraction_source=(
+            raw_entry.get("extraction_source")
+            if isinstance(raw_entry.get("extraction_source"), str)
+            else None
+        ),
+        aliases=aliases,
+        references=references,
+        notes=notes,
+        storage=storage if isinstance(storage, Mapping) else None,
+        coverage=_coerce_coverage(raw_entry.get("coverage")),
+    )
+
+
+@lru_cache(maxsize=None)
+def _load_csv_columns(filename: str) -> dict[str, tuple[float | None, ...]]:
+ """Load all value columns from one packaged dense-by-Z CSV table."""
+
+ path = resources.files("atomref.data").joinpath(filename)
+ with path.open("r", encoding="utf-8", newline="") as handle:
+ reader = csv.DictReader(handle)
+ if reader.fieldnames is None or "z" not in reader.fieldnames:
+ raise DatasetError(f"invalid CSV file: {filename!r}")
+ columns = [name for name in reader.fieldnames if name != "z"]
+ values: dict[str, list[float | None]] = {name: [None] * 119 for name in columns}
+ for row in reader:
+ z_text = row.get("z")
+ if z_text is None:
+ continue
+ z = int(z_text)
+ for name in columns:
+ raw = row.get(name)
+ if raw is None:
+ values[name][z] = None
+ continue
+ raw = raw.strip()
+ values[name][z] = (
+ _coerce_finite_float(
+ raw,
+ what=f"value in {filename!r} column {name!r} for Z={z}",
+ )
+ if raw
+ else None
+ )
+ return {name: tuple(vals) for name, vals in values.items()}
+
+
+@lru_cache(maxsize=None)
+def get_builtin_set(ref: DatasetRef) -> ElementScalarSet:
+ """Load a packaged dataset as an :class:`ElementScalarSet`."""
+
+ info = get_dataset_info(ref)
+ if info.domain != "element":
+ raise DatasetError(
+ f"only element-domain datasets are currently supported: {info.ref!r}"
+ )
+ if not isinstance(info.storage, Mapping):
+ raise DatasetError(f"missing storage metadata for dataset: {info.ref!r}")
+
+ filename = info.storage.get("filename")
+ column = info.storage.get("column")
+ if not isinstance(filename, str) or not isinstance(column, str):
+ raise DatasetError(f"invalid storage metadata for dataset: {info.ref!r}")
+
+ table = _load_csv_columns(filename)
+ if column not in table:
+ raise DatasetError(f"column {column!r} not found in {filename!r}")
+
+ return ElementScalarSet(ref=info.ref, info=info, values_by_z=table[column])
+
+
+def resolve_dataset_like(dataset: DatasetLike) -> ElementScalarSet:
+ """Resolve either a packaged reference or a custom set to a loaded set."""
+
+ if isinstance(dataset, ElementScalarSet):
+ return dataset
+ return get_builtin_set(dataset)
+
+
+def _is_placeholder_value(info: DatasetInfo, value: float) -> bool:
+ """Return ``True`` when ``value`` equals the dataset's placeholder value."""
+
+ if info.placeholder_value is None:
+ return False
+ return abs(value - info.placeholder_value) < 1e-12
diff --git a/src/atomref/transfer.py b/src/atomref/transfer.py
new file mode 100644
index 0000000..9adb0ce
--- /dev/null
+++ b/src/atomref/transfer.py
@@ -0,0 +1,168 @@
+"""Transfer-model configuration types for policy-based lookup."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable
+
+from .errors import PolicyError
+from .registry import DatasetLike
+
+if TYPE_CHECKING: # pragma: no cover - typing only
+ from .policy import ValuePolicy
+
+
+TransferValueSource = Literal[
+    "override",
+    "base",
+    "transfer_substitution",
+    "transfer_linear",
+    "fallback",
+]
+"""Source labels that may be admitted into nested linear-transfer workflows."""
+
+# Runtime counterpart of ``TransferValueSource``: a ``Literal`` alias cannot be
+# membership-tested at runtime, so validation uses this frozenset instead.
+_ALLOWED_TRANSFER_VALUE_SOURCES = frozenset(
+    {
+        "override",
+        "base",
+        "transfer_substitution",
+        "transfer_linear",
+        "fallback",
+    }
+)
+
+# Conservative fitting default: only directly provided values may enter a fit.
+_DEFAULT_LINEAR_FIT_SOURCES: tuple[TransferValueSource, ...] = (
+    "base",
+    "override",
+)
+# Prediction default additionally admits transfer-derived values.
+_DEFAULT_LINEAR_PREDICTION_SOURCES: tuple[TransferValueSource, ...] = (
+    "base",
+    "override",
+    "transfer_substitution",
+    "transfer_linear",
+)
+
+
+@runtime_checkable
+class SupportsValuePolicy(Protocol):
+    """Protocol for wrapper objects that can expose a generic value policy."""
+
+    def as_value_policy(self) -> "ValuePolicy[str]":
+        """Return the generic element-domain value policy."""
+
+
+@dataclass(frozen=True, slots=True)
+class LinearFit:
+    """Summary statistics for a fitted linear transfer model.
+
+    Parameters are stored in a compact, serializable form so they can be
+    attached to :class:`atomref.policy.LookupResult` objects and reused in
+    reporting code.
+    """
+
+    coefficients: tuple[float, ...]  # fitted slope(s)
+    intercept: float
+    n_points: int  # number of data points that entered the fit
+    r2: float  # coefficient of determination of the fit
+    rmse: float  # root-mean-square error of the fit
+
+
+@dataclass(frozen=True, slots=True)
+class SubstitutionTransfer:
+    """Use another dataset or policy directly when the base dataset is missing.
+
+    The selected value is copied from the source rather than inferred.
+    """
+
+    source: DatasetLike | SupportsValuePolicy | ValuePolicy[str]
+
+
+@dataclass(frozen=True, slots=True)
+class LinearTransfer:
+    """Infer missing target values from one or more predictor datasets or policies.
+
+    In the current implementation the public API stores predictors as a tuple
+    for forward compatibility, but the runtime intentionally accepts exactly one
+    predictor source.
+
+    For nested policy predictors, two safeguards apply:
+
+    - ``fit_sources`` / ``fit_max_depth`` control which predictor values may be
+      used when fitting the linear model itself;
+    - ``prediction_sources`` / ``prediction_max_depth`` control which nested
+      predictor values may be used for the final requested element.
+
+    The defaults are intentionally conservative for fitting and permissive only
+    enough to allow one additional completion step at prediction time.
+    """
+
+    predictors: tuple[DatasetLike | SupportsValuePolicy | ValuePolicy[str], ...]
+    min_points: int = 2  # lower bound on fit points (validated >= 2 below)
+    exclude_placeholders: bool = True  # presumably skips placeholder sentinels when fitting — see the resolver
+    fit_sources: tuple[TransferValueSource, ...] = _DEFAULT_LINEAR_FIT_SOURCES
+    prediction_sources: tuple[TransferValueSource, ...] = (
+        _DEFAULT_LINEAR_PREDICTION_SOURCES
+    )
+    fit_max_depth: int = 0
+    prediction_max_depth: int = 1
+
+    def __post_init__(self) -> None:
+        """Validate obvious configuration errors eagerly."""
+
+        if not self.predictors:
+            raise PolicyError("LinearTransfer requires at least one predictor")
+        if self.min_points < 2:
+            raise PolicyError("LinearTransfer min_points must be at least 2")
+
+        # The dataclass is frozen, so the normalized source tuples must be
+        # written back through object.__setattr__.
+        object.__setattr__(
+            self,
+            "fit_sources",
+            _normalize_transfer_value_sources(
+                self.fit_sources,
+                field_name="fit_sources",
+            ),
+        )
+        object.__setattr__(
+            self,
+            "prediction_sources",
+            _normalize_transfer_value_sources(
+                self.prediction_sources,
+                field_name="prediction_sources",
+            ),
+        )
+
+        if self.fit_max_depth < 0:
+            raise PolicyError("LinearTransfer fit_max_depth must be non-negative")
+        if self.prediction_max_depth < 0:
+            raise PolicyError(
+                "LinearTransfer prediction_max_depth must be non-negative"
+            )
+
+
+TransferModel = SubstitutionTransfer | LinearTransfer
+"""Closed union of transfer models supported by the core resolver."""
+
+
+def _normalize_transfer_value_sources(
+ sources: tuple[str, ...],
+ *,
+ field_name: str,
+) -> tuple[TransferValueSource, ...]:
+ """Validate and deduplicate source-label controls for linear transfers."""
+
+ if not sources:
+ raise PolicyError(f"LinearTransfer {field_name} may not be empty")
+
+ normalized: list[TransferValueSource] = []
+ seen: set[str] = set()
+ for source in sources:
+ if source not in _ALLOWED_TRANSFER_VALUE_SOURCES:
+ allowed = ", ".join(sorted(_ALLOWED_TRANSFER_VALUE_SOURCES))
+ raise PolicyError(
+ f"LinearTransfer {field_name} contains unsupported source "
+ f"{source!r}; allowed values are: {allowed}"
+ )
+ if source not in seen:
+ normalized.append(source)
+ seen.add(source)
+ return tuple(normalized)
diff --git a/src/atomref/xh.py b/src/atomref/xh.py
new file mode 100644
index 0000000..5018d99
--- /dev/null
+++ b/src/atomref/xh.py
@@ -0,0 +1,175 @@
+"""X-H bond-length helpers built on the generic policy core."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+import math
+
+from .elements import canonicalize_element_symbol, is_valid_element_symbol
+from .errors import PolicyError
+from .policy import (
+ LookupResult,
+ ValuePolicy,
+ _get_value_from_policy_source,
+ _lookup_value_from_policy_source,
+)
+from .registry import (
+ DatasetInfo,
+ DatasetRef,
+ ElementScalarSet,
+ get_builtin_set,
+ get_dataset_info,
+ list_dataset_ids,
+ list_dataset_infos,
+)
+from .transfer import LinearTransfer, TransferModel
+
+XHSet = ElementScalarSet  # X-H sets reuse the generic element-scalar container
+
+_QUANTITY = "xh_bond_length"  # fixed quantity key for every helper in this module
+
+
+@dataclass(frozen=True, slots=True)
+class XHPolicy:
+    """Policy wrapper specialized for parent-element X-H bond lengths.
+
+    The quantity key is fixed to ``"xh_bond_length"`` and uses the parent
+    element ``X`` as the lookup key. ``H`` itself is not considered a valid
+    parent element for this quantity.
+    """
+
+    base_set: str | XHSet  # packaged set id/alias, or a pre-loaded custom set
+    transfers: tuple[TransferModel, ...] = ()
+    overrides: Mapping[str, float] = field(default_factory=dict)
+    fallback: float | None = None
+
+    def as_value_policy(self) -> ValuePolicy[str]:
+        """Convert the X-H policy into the generic scalar-value policy.
+
+        Raises :class:`PolicyError` for incompatible base sets, invalid parent
+        symbols, and negative or non-finite numeric inputs.
+        """
+
+        if isinstance(self.base_set, ElementScalarSet):
+            # A custom base set must actually be an X-H dataset, not e.g. radii.
+            if self.base_set.ref.quantity != _QUANTITY:
+                raise PolicyError(
+                    "base_set quantity "
+                    f"{self.base_set.ref.quantity!r} is incompatible "
+                    "with X-H lookup"
+                )
+            base = self.base_set
+        else:
+            base = DatasetRef(_QUANTITY, self.base_set)
+
+        checked_overrides: dict[str, float] = {}
+        for key, value in self.overrides.items():
+            sym = _normalize_xh_symbol(key)
+            if sym is None or not is_valid_element_symbol(sym):
+                raise PolicyError(f"invalid X-H parent element symbol: {key!r}")
+            if sym == "H":
+                raise PolicyError("H is not a valid parent element for xh_bond_length")
+            # NOTE(review): validation uses the normalized symbol, but the value
+            # is stored under the caller's original key; presumably ValuePolicy
+            # re-normalizes keys on lookup — verify against atomref.policy.
+            checked_overrides[key] = _coerce_non_negative_xh_value(
+                value,
+                what=f"X-H override value for {key!r}",
+            )
+
+        checked_fallback = (
+            None
+            if self.fallback is None
+            else _coerce_non_negative_xh_value(self.fallback, what="X-H fallback")
+        )
+
+        return ValuePolicy(
+            base=base,
+            transfers=self.transfers,
+            overrides=checked_overrides,
+            fallback=checked_fallback,
+            blocked=("H",),  # H can never resolve as a parent element
+        )
+
+
+def _coerce_non_negative_xh_value(value: object, *, what: str) -> float:
+ """Validate an X-H-like policy number."""
+
+ try:
+ out = float(value)
+ except (TypeError, ValueError) as exc:
+ raise PolicyError(f"{what} must be a finite float") from exc
+ if not math.isfinite(out):
+ raise PolicyError(f"{what} must be a finite float")
+ if out < 0:
+ raise PolicyError(f"{what} must be non-negative")
+ return out
+
+
+def _normalize_xh_symbol(symbol: str | None) -> str | None:
+    """Normalize symbols accepted by the X-H convenience layer."""
+
+    cand = canonicalize_element_symbol(symbol)
+    # D and T are hydrogen isotopes; fold them onto plain H.
+    if cand in {"D", "T"}:
+        cand = "H"
+    return cand
+
+
+def list_xh_sets(*, usage_role: str | None = None) -> tuple[str, ...]:
+    """List packaged X-H set ids."""
+
+    return list_dataset_ids(_QUANTITY, usage_role=usage_role)
+
+
+def list_xh_set_infos(*, usage_role: str | None = None) -> tuple[DatasetInfo, ...]:
+    """Return packaged metadata objects for X-H sets."""
+
+    return list_dataset_infos(_QUANTITY, usage_role=usage_role)
+
+
+def get_xh_set_info(set_id: str) -> DatasetInfo:
+    """Return metadata for one packaged X-H set."""
+
+    return get_dataset_info(DatasetRef(_QUANTITY, set_id))
+
+
+def get_xh_set(set_id: str) -> XHSet:
+    """Load one packaged X-H set as an :class:`ElementScalarSet`."""
+
+    return get_builtin_set(DatasetRef(_QUANTITY, set_id))
+
+
+def lookup_xh_bond_length(
+ symbol: str | None,
+ *,
+ policy: XHPolicy | None = None,
+) -> LookupResult:
+ """Resolve a parent-element X-H bond length with provenance."""
+
+ active = DEFAULT_XH_POLICY if policy is None else policy
+ lookup = _lookup_value_from_policy_source(symbol, source=active)
+ if lookup.value is None and _normalize_xh_symbol(symbol) == "H":
+ return LookupResult(
+ value=None,
+ source="missing",
+ target=lookup.target,
+ notes=("H is not a valid parent element for xh_bond_length",),
+ )
+ return lookup
+
+
+def get_xh_bond_length(
+ symbol: str | None,
+ *,
+ policy: XHPolicy | None = None,
+) -> float | None:
+ """Return only the selected X-H bond-length value, without provenance."""
+
+ active = DEFAULT_XH_POLICY if policy is None else policy
+ return _get_value_from_policy_source(symbol, source=active)
+
+
+DEFAULT_XH_POLICY = XHPolicy(
+    base_set="csd_legacy_xh_cno",
+    transfers=(
+        # Missing parent elements are inferred with a linear model driven by
+        # the "cordero2008" covalent-radius set.
+        LinearTransfer(
+            predictors=(DatasetRef("covalent_radius", "cordero2008"),),
+            min_points=3,  # stricter than LinearTransfer's default of 2
+            exclude_placeholders=True,
+        ),
+    ),
+)
+"""Default X-H policy used by the convenience helpers."""
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..08328a4
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,9 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Make the in-repo ``src`` layout importable so the test suite runs from a
+# source checkout without installing the package first.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+SRC = REPO_ROOT / 'src'
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
diff --git a/tests/elements/test_elements.py b/tests/elements/test_elements.py
new file mode 100644
index 0000000..161b420
--- /dev/null
+++ b/tests/elements/test_elements.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import atomref as ar
+
+
+def test_element_lookup_and_validation() -> None:
+ assert ar.is_valid_element_symbol('C')
+ assert ar.is_valid_element_symbol('cl') is False
+ assert ar.get_element('cl') is not None
+ assert ar.get_element('C').z == 6
+ assert ar.get_element('Xx') is None
+
+
+def test_iter_elements_is_sorted_and_complete() -> None:
+ elems = ar.iter_elements()
+ assert elems[0].symbol == 'H'
+ assert elems[-1].symbol == 'Og'
+ assert elems[0].z == 1
+ assert elems[-1].z == 118
diff --git a/tests/meta/test_imports.py b/tests/meta/test_imports.py
new file mode 100644
index 0000000..66210e7
--- /dev/null
+++ b/tests/meta/test_imports.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import importlib
+
+
+MODULES = [
+ 'atomref',
+ 'atomref.elements',
+ 'atomref.registry',
+ 'atomref.transfer',
+ 'atomref.policy',
+ 'atomref.radii',
+ 'atomref.xh',
+]
+
+
+def test_imports() -> None:
+ for name in MODULES:
+ importlib.import_module(name)
diff --git a/tests/meta/test_notebooks.py b/tests/meta/test_notebooks.py
new file mode 100644
index 0000000..d420476
--- /dev/null
+++ b/tests/meta/test_notebooks.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from pathlib import Path
+import subprocess
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+CHECK_SCRIPT = REPO_ROOT / "tools" / "check_notebooks.py"
+EXPORT_SCRIPT = REPO_ROOT / "tools" / "export_notebooks.py"
+NOTEBOOKS = REPO_ROOT / "notebooks"
+EXPORTED_NOTEBOOKS = REPO_ROOT / "docs" / "notebooks"
+
+
+def test_notebook_files_exist() -> None:
+    # Extra notebooks are allowed; only this curated set is mandatory.
+    expected = {
+        "01-quickstart.ipynb",
+        "02-policies-and-assessment.ipynb",
+        "03-custom-sets-and-discovery.ipynb",
+    }
+    actual = {path.name for path in NOTEBOOKS.glob("*.ipynb")}
+    assert expected.issubset(actual)
+
+
+def test_notebooks_validate_and_execute() -> None:
+    # Delegates to the repo tool; check=True fails the test on non-zero exit.
+    subprocess.run([sys.executable, str(CHECK_SCRIPT)], cwd=REPO_ROOT, check=True)
+
+
+def test_exported_notebook_pages_are_in_sync() -> None:
+    expected = {
+        "01-quickstart.md",
+        "02-policies-and-assessment.md",
+        "03-custom-sets-and-discovery.md",
+    }
+    actual = {path.name for path in EXPORTED_NOTEBOOKS.glob("*.md")}
+    assert expected.issubset(actual)
+    # --check mode: the tool is expected to exit non-zero when exports are stale.
+    subprocess.run(
+        [sys.executable, str(EXPORT_SCRIPT), "--check"],
+        cwd=REPO_ROOT,
+        check=True,
+    )
diff --git a/tests/meta/test_package_data.py b/tests/meta/test_package_data.py
new file mode 100644
index 0000000..a9a7e61
--- /dev/null
+++ b/tests/meta/test_package_data.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from importlib import resources
+import json
+
+
+def test_packaged_data_files_are_available() -> None:
+    # These files ship inside the wheel; a packaging regression breaks lookups.
+    data_root = resources.files('atomref.data')
+    for name in (
+        'periodic_table.csv',
+        'covalent.csv',
+        'van_der_waals.csv',
+        'registry.json',
+        'xh_bond_length.csv',
+    ):
+        assert data_root.joinpath(name).is_file(), name
+
+
+def test_packaged_registry_keeps_atomic_support_classification() -> None:
+    data_root = resources.files('atomref.data')
+    raw = json.loads(data_root.joinpath('registry.json').read_text(encoding='utf-8'))
+
+    assert 'atomic_radius' in raw['datasets']
+    assert 'xh_bond_length' in raw['datasets']
+    # rahm2016 must stay a "support" set with its scientific classification intact.
+    rahm = raw['datasets']['atomic_radius']['rahm2016']
+    assert rahm['usage_role'] == 'support'
+    assert rahm['semantic_class'] == 'atomic_isodensity'
+    assert rahm['phase_context'] == 'isolated_atom'
diff --git a/tests/meta/test_public_api.py b/tests/meta/test_public_api.py
new file mode 100644
index 0000000..f3583a1
--- /dev/null
+++ b/tests/meta/test_public_api.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+import atomref as ar
+
+
+REQUIRED_PUBLIC_NAMES = {
+ 'Element',
+ 'DatasetRef',
+ 'DatasetInfo',
+ 'ElementScalarSet',
+ 'QuantityInfo',
+ 'LookupResult',
+ 'RadiiPolicy',
+ 'DEFAULT_COVALENT_POLICY',
+ 'DEFAULT_VDW_POLICY',
+ 'LinearTransfer',
+ 'SubstitutionTransfer',
+ 'get_builtin_set',
+ 'get_radii_set',
+ 'get_covalent_radius',
+ 'lookup_covalent_radius',
+ 'get_vdw_radius',
+ 'lookup_vdw_radius',
+ 'XHPolicy',
+ 'DEFAULT_XH_POLICY',
+ 'get_xh_set',
+ 'get_xh_bond_length',
+ 'lookup_xh_bond_length',
+ 'list_xh_sets',
+ 'list_xh_set_infos',
+ 'list_quantities',
+ 'list_dataset_ids',
+ 'list_dataset_infos',
+ 'list_radii_sets',
+ 'list_radii_set_infos',
+}
+
+
+def test___all___exports_existing_objects() -> None:
+ for name in ar.__all__:
+ assert hasattr(ar, name), name
+
+
+def test_core_public_api_names_are_exported() -> None:
+ assert REQUIRED_PUBLIC_NAMES.issubset(set(ar.__all__))
diff --git a/tests/meta/test_readme_sync.py b/tests/meta/test_readme_sync.py
new file mode 100644
index 0000000..fe56ac2
--- /dev/null
+++ b/tests/meta/test_readme_sync.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from pathlib import Path
+import subprocess
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+README = REPO_ROOT / 'README.md'
+SCRIPT = REPO_ROOT / 'tools' / 'gen_readme.py'
+
+
+def test_readme_is_in_sync(tmp_path: Path) -> None:
+ generated = tmp_path / 'README.generated.md'
+ subprocess.run(
+ [sys.executable, str(SCRIPT), '--output', str(generated)],
+ cwd=REPO_ROOT,
+ check=True,
+ )
+ assert generated.read_text(encoding='utf-8') == README.read_text(encoding='utf-8')
diff --git a/tests/meta/test_registry_integrity.py b/tests/meta/test_registry_integrity.py
new file mode 100644
index 0000000..a32b44c
--- /dev/null
+++ b/tests/meta/test_registry_integrity.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import asdict
+
+import atomref as ar
+from atomref.registry import _canonicalize_alias_token, get_builtin_set
+
+_ALLOWED_USAGE_ROLES = {"target", "support"}
+
+
+def test_dataset_aliases_are_unique_within_each_quantity() -> None:
+ for quantity in ar.list_quantities():
+ seen: dict[str, str] = {}
+ for set_id in ar.list_dataset_ids(quantity):
+ info = ar.get_dataset_info(ar.DatasetRef(quantity, set_id))
+ for token in (set_id, *info.aliases):
+ key = _canonicalize_alias_token(token)
+ previous = seen.get(key)
+ assert previous in (None, set_id)
+ seen[key] = set_id
+
+
+def test_every_built_in_dataset_loads_and_matches_coverage_metadata() -> None:
+ for quantity in ar.list_quantities():
+ quantity_info = ar.get_quantity_info(quantity)
+ for set_id in ar.list_dataset_ids(quantity):
+ ref = ar.DatasetRef(quantity, set_id)
+ info = ar.get_dataset_info(ref)
+ dataset = get_builtin_set(ref)
+
+ assert info.domain == quantity_info.domain
+ assert info.units == quantity_info.units
+ assert info.usage_role in _ALLOWED_USAGE_ROLES
+ assert info.references
+ assert info.coverage is not None
+
+ max_z = (
+ info.coverage.z_max
+ if info.coverage.z_max is not None
+ else len(dataset.values_by_z) - 1
+ )
+ covered_z = tuple(
+ z
+ for z, value in enumerate(dataset.values_by_z)
+ if z > 0 and value is not None and z <= max_z
+ )
+ covered_set = set(covered_z)
+ missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set)
+ has_placeholders = info.placeholder_value is not None and any(
+ value is not None and abs(value - info.placeholder_value) < 1e-12
+ for value in dataset.values_by_z[1 : max_z + 1]
+ )
+
+ coverage = asdict(info.coverage)
+ assert coverage["n_values"] == len(covered_z)
+ assert coverage["z_min"] == (min(covered_z) if covered_z else None)
+ assert coverage["z_max"] == (max(covered_z) if covered_z else None)
+ assert coverage["has_placeholders"] is has_placeholders
+ if coverage["covered_z"]:
+ assert tuple(coverage["covered_z"]) == covered_z
+ if coverage["missing_z"]:
+ assert tuple(coverage["missing_z"]) == missing_z
+
+
+def test_non_atomic_quantities_have_at_least_one_target_dataset() -> None:
+ by_role: dict[str, list[str]] = defaultdict(list)
+ for quantity in ar.list_quantities():
+ for set_id in ar.list_dataset_ids(quantity):
+ role = ar.get_dataset_info(ar.DatasetRef(quantity, set_id)).usage_role
+ assert role is not None
+ by_role[role].append(quantity)
+
+ for quantity in ar.list_quantities():
+ if quantity != "atomic_radius":
+ assert quantity in by_role["target"]
diff --git a/tests/meta/test_release_tools.py b/tests/meta/test_release_tools.py
new file mode 100644
index 0000000..7cbff90
--- /dev/null
+++ b/tests/meta/test_release_tools.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+# Keeping this as a subprocess test ensures the helper stays importable and
+# exposes a stable CLI entry point without running the expensive full release
+# workflow inside the unit test suite.
+def test_release_check_help() -> None:
+ result = subprocess.run(
+ [sys.executable, "tools/release_check.py", "--help"],
+ cwd=REPO_ROOT,
+ check=True,
+ capture_output=True,
+ text=True,
+ )
+ assert "release-preparation checks" in result.stdout
diff --git a/tests/meta/test_text_generation_tools.py b/tests/meta/test_text_generation_tools.py
new file mode 100644
index 0000000..b6203a7
--- /dev/null
+++ b/tests/meta/test_text_generation_tools.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import importlib.util
+from pathlib import Path
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+MODULE_PATH = REPO_ROOT / "tools" / "export_notebooks.py"
+
+spec = importlib.util.spec_from_file_location("export_notebooks_tool", MODULE_PATH)
+assert spec is not None and spec.loader is not None
+export_notebooks = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = export_notebooks
+spec.loader.exec_module(export_notebooks)
+
+
+def test_export_notebooks_check_ignores_crlf(tmp_path: Path) -> None:
+ """Notebook export checks should ignore Windows vs Unix newline differences."""
+
+ output_dir = tmp_path / "docs"
+ output_dir.mkdir()
+
+ for notebook_name, output_name in export_notebooks.NOTEBOOK_OUTPUTS.items():
+ rendered = export_notebooks._export_markdown(
+ export_notebooks.NOTEBOOKS / notebook_name
+ )
+ (output_dir / output_name).write_text(
+ rendered.replace("\n", "\r\n"),
+ encoding="utf-8",
+ newline="",
+ )
+
+ assert export_notebooks.export_notebooks(output_dir, check=True) == 0
diff --git a/tests/policy/test_policy.py b/tests/policy/test_policy.py
new file mode 100644
index 0000000..618829a
--- /dev/null
+++ b/tests/policy/test_policy.py
@@ -0,0 +1,248 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import pytest
+
+import atomref as ar
+from atomref.errors import PolicyError
+
+
+def _make_custom_set(
+ quantity: str,
+ set_id: str,
+ values: dict[str, float | None],
+) -> ar.ElementScalarSet:
+ return ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef(quantity, set_id),
+ values=values,
+ name=set_id,
+ units='angstrom',
+ )
+
+
+def _make_partial_covalent_policy(*, include_o: bool) -> ar.RadiiPolicy:
+ values = {
+ 'C': 0.76,
+ 'N': 0.71,
+ }
+ if include_o:
+ values['O'] = 0.66
+ custom = ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef('covalent_radius', 'demo_partial_cov'),
+ values=values,
+ name='Demo partial covalent set',
+ units='angstrom',
+ )
+ return ar.RadiiPolicy(
+ kind='covalent',
+ base_set=custom,
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),),
+ min_points=2,
+ exclude_placeholders=True,
+ ),
+ ),
+ )
+
+
+@dataclass
+class _DemoPolicyWrapper:
+ base: ar.ElementScalarSet
+ source: object | None = None
+
+ def as_value_policy(self) -> ar.ValuePolicy[str]:
+ transfers = ()
+ if self.source is not None:
+ transfers = (ar.SubstitutionTransfer(source=self.source),)
+ return ar.ValuePolicy(base=self.base, transfers=transfers)
+
+
+def test_lookup_value_is_public_generic_entry_point() -> None:
+ policy = ar.ValuePolicy(
+ base=ar.DatasetRef('covalent_radius', 'cordero2008'),
+ overrides={'d': 0.5},
+ )
+ lookup = ar.lookup_value('H', policy=policy)
+ assert lookup.source == 'override'
+ assert lookup.value == pytest.approx(0.5)
+ assert lookup.transfer_depth == 0
+
+
+def test_get_value_returns_only_scalar() -> None:
+ policy = ar.ValuePolicy(base=ar.DatasetRef('covalent_radius', 'cordero2008'))
+ assert ar.get_value('C', policy=policy) == pytest.approx(0.76)
+
+
+def test_value_policy_rejects_normalized_override_collisions() -> None:
+ with pytest.raises(PolicyError):
+ ar.ValuePolicy(
+ base=ar.DatasetRef('covalent_radius', 'cordero2008'),
+ overrides={'H': 0.31, 'D': 0.4},
+ )
+
+
+def test_value_policy_rejects_non_finite_fallback() -> None:
+ with pytest.raises(PolicyError):
+ ar.ValuePolicy(
+ base=ar.DatasetRef('covalent_radius', 'cordero2008'),
+ fallback=float('nan'),
+ )
+
+
+def test_substitution_transfer_accepts_policy_source() -> None:
+ custom = ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef('covalent_radius', 'demo_user_cov'),
+ values={'C': 0.77},
+ name='Demo covalent set',
+ units='angstrom',
+ )
+ policy = ar.ValuePolicy(
+ base=custom,
+ transfers=(ar.SubstitutionTransfer(source=ar.DEFAULT_COVALENT_POLICY),),
+ )
+ lookup = ar.lookup_value('Bk', policy=policy)
+ assert lookup.source == 'transfer_substitution'
+ assert lookup.value == pytest.approx(1.54)
+ assert lookup.transfer_depth == 2
+ assert lookup.resolved_from == (
+ ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),
+ )
+ assert any('policy source' in note for note in lookup.notes)
+
+
+def test_linear_transfer_accepts_policy_predictor() -> None:
+ predictor_policy = ar.ValuePolicy(base=ar.DatasetRef('atomic_radius', 'rahm2016'))
+ policy = ar.RadiiPolicy(
+ kind='van_der_waals',
+ base_set='alvarez2013',
+ transfers=(ar.LinearTransfer(predictors=(predictor_policy,),),),
+ )
+ lookup = ar.lookup_vdw_radius('Pm', policy=policy)
+ assert lookup.source == 'transfer_linear'
+ assert lookup.value == pytest.approx(ar.lookup_vdw_radius('Pm').value)
+ assert lookup.transfer_depth == 1
+ assert lookup.fit is not None
+ assert any('policy source' in note for note in lookup.notes)
+
+
+def test_linear_transfer_defaults_allow_direct_fit_and_one_nested_prediction() -> None:
+ predictor_policy = _make_partial_covalent_policy(include_o=True)
+ policy = ar.XHPolicy(
+ base_set='csd_legacy_xh_cno',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(predictor_policy,),
+ min_points=3,
+ exclude_placeholders=True,
+ ),
+ ),
+ )
+ lookup = ar.lookup_xh_bond_length('S', policy=policy)
+ assert lookup.source == 'transfer_linear'
+ assert lookup.transfer_depth == 2
+ assert lookup.fit is not None
+ assert lookup.fit.n_points == 3
+ assert lookup.value == pytest.approx(ar.lookup_xh_bond_length('S').value)
+
+
+def test_linear_transfer_fit_restrictions_block_inference_on_inference_by_default(
+) -> None:
+ predictor_policy = _make_partial_covalent_policy(include_o=False)
+ policy = ar.XHPolicy(
+ base_set='csd_legacy_xh_cno',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(predictor_policy,),
+ min_points=3,
+ exclude_placeholders=True,
+ ),
+ ),
+ )
+ with pytest.raises(PolicyError, match='fit-source restrictions'):
+ ar.lookup_xh_bond_length('S', policy=policy)
+
+
+def test_linear_transfer_fit_restrictions_can_be_relaxed_explicitly() -> None:
+ predictor_policy = _make_partial_covalent_policy(include_o=False)
+ policy = ar.XHPolicy(
+ base_set='csd_legacy_xh_cno',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(predictor_policy,),
+ min_points=3,
+ exclude_placeholders=True,
+ fit_sources=('base', 'override', 'transfer_linear'),
+ fit_max_depth=1,
+ ),
+ ),
+ )
+ lookup = ar.lookup_xh_bond_length('S', policy=policy)
+ assert lookup.source == 'transfer_linear'
+ assert lookup.fit is not None
+ assert lookup.fit.n_points == 3
+
+
+def test_linear_transfer_prediction_depth_can_be_tightened() -> None:
+ predictor_policy = _make_partial_covalent_policy(include_o=True)
+ policy = ar.XHPolicy(
+ base_set='csd_legacy_xh_cno',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(predictor_policy,),
+ min_points=3,
+ exclude_placeholders=True,
+ prediction_max_depth=0,
+ ),
+ ),
+ )
+ lookup = ar.lookup_xh_bond_length('S', policy=policy)
+ assert lookup.value is None
+ assert lookup.source == 'missing'
+ assert any('prediction_max_depth' in note for note in lookup.notes)
+
+
+def test_linear_transfer_rejects_invalid_nested_source_configuration() -> None:
+ with pytest.raises(PolicyError, match='fit_max_depth'):
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),),
+ fit_max_depth=-1,
+ )
+ with pytest.raises(PolicyError, match='allowed values'):
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef('covalent_radius', 'cordero2008'),),
+ prediction_sources=('missing',), # type: ignore[arg-type]
+ )
+
+
+def test_lookup_value_detects_generic_policy_cycles() -> None:
+ empty_1 = _make_custom_set('covalent_radius', 'cycle_empty_1', {})
+ empty_2 = _make_custom_set('covalent_radius', 'cycle_empty_2', {})
+ policy_1 = ar.ValuePolicy(base=empty_1)
+ policy_2 = ar.ValuePolicy(
+ base=empty_2,
+ transfers=(ar.SubstitutionTransfer(source=policy_1),),
+ )
+ object.__setattr__(
+ policy_1,
+ 'transfers',
+ (ar.SubstitutionTransfer(source=policy_2),),
+ )
+
+ with pytest.raises(PolicyError, match='cyclic policy resolution detected'):
+ ar.lookup_value('C', policy=policy_1)
+
+
+def test_wrapper_policy_cycles_are_detected() -> None:
+ empty = _make_custom_set('covalent_radius', 'demo_empty_cov', {})
+ wrapper_a = _DemoPolicyWrapper(base=empty)
+ wrapper_b = _DemoPolicyWrapper(base=empty, source=wrapper_a)
+ wrapper_a.source = wrapper_b
+
+ policy = ar.ValuePolicy(
+ base=empty,
+ transfers=(ar.SubstitutionTransfer(source=wrapper_a),),
+ )
+ with pytest.raises(PolicyError, match='cyclic policy resolution detected'):
+ ar.lookup_value('C', policy=policy)
diff --git a/tests/radii/test_assessment.py b/tests/radii/test_assessment.py
new file mode 100644
index 0000000..664d867
--- /dev/null
+++ b/tests/radii/test_assessment.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import atomref as ar
+
+
+def test_assess_vdw_default_linear_counts() -> None:
+ rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY)
+ assert rep.kind == 'van_der_waals'
+ assert rep.n_elements == 2
+ assert rep.n_base == 1
+ assert rep.n_transfer_linear == 1
+ assert rep.n_missing == 0
+ assert rep.fits
+ assert rep.fits[0].n_points == 90
+
+
+def test_assess_vdw_detail_reports_sources() -> None:
+ rep = ar.assess_radii_policy(['Pm', 'O'], policy=ar.DEFAULT_VDW_POLICY, detail=True)
+ by_sym = {d.symbol: d for d in rep.per_element}
+ assert by_sym['O'].lookup.source == 'base'
+ assert by_sym['Pm'].lookup.source == 'transfer_linear'
+
+
+def test_assess_covalent_sub_placeholder_count() -> None:
+ rep = ar.assess_radii_policy(['Es'], policy=ar.DEFAULT_COVALENT_POLICY)
+ assert rep.kind == 'covalent'
+ assert rep.n_elements == 1
+ assert rep.n_transfer_substitution == 1
+ assert rep.n_placeholders == 1
+ assert rep.placeholder_symbols == ('Es',)
+ assert rep.n_missing == 0
+
+
+def test_assess_covalent_missing_in_both_sets() -> None:
+ rep = ar.assess_radii_policy(['Rg'], policy=ar.DEFAULT_COVALENT_POLICY)
+ assert rep.n_missing == 1
+ assert rep.missing_symbols == ('Rg',)
diff --git a/tests/radii/test_selection.py b/tests/radii/test_selection.py
new file mode 100644
index 0000000..8977363
--- /dev/null
+++ b/tests/radii/test_selection.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import pytest
+
+import atomref as ar
+from atomref.errors import PolicyError
+
+
+def test_get_covalent_radius_default_prefers_cordero() -> None:
+ assert ar.get_covalent_radius("C") == pytest.approx(0.76)
+
+
+def test_get_covalent_radius_maps_deuterium_to_hydrogen() -> None:
+ assert ar.get_covalent_radius("D") == pytest.approx(0.31)
+
+
+def test_get_vdw_radius_default_prefers_alvarez() -> None:
+ assert ar.get_vdw_radius("C") == pytest.approx(1.77)
+
+
+def test_completion_is_used_for_missing_base_values() -> None:
+ m = ar.lookup_covalent_radius("Bk")
+ assert m.value is not None
+ assert m.source == "transfer_substitution"
+
+ m2 = ar.lookup_vdw_radius("Pm")
+ assert m2.value is not None
+ assert m2.source == "transfer_linear"
+ assert m2.value == pytest.approx(2.897226539514835)
+
+
+def test_linear_transfer_rejects_placeholder_values() -> None:
+ scheme = ar.RadiiPolicy(
+ kind="van_der_waals",
+ base_set="bondi1964",
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),)
+ ),
+ ),
+ )
+ m = ar.lookup_vdw_radius("Be", policy=scheme)
+ assert m.value is None
+ assert m.source == "missing"
+ assert any("placeholder" in s for s in m.notes)
+
+
+def test_lookup_float_conversion() -> None:
+ m = ar.lookup_covalent_radius("C")
+ assert float(m) == pytest.approx(0.76)
+
+ m_missing = ar.lookup_covalent_radius("Xx")
+ with pytest.raises(TypeError):
+ float(m_missing)
+
+
+def test_override_precedes_base_value() -> None:
+ policy = ar.RadiiPolicy(
+ kind="covalent",
+ base_set="cordero2008",
+ overrides={"C": 9.99},
+ )
+ lookup = ar.lookup_covalent_radius("C", policy=policy)
+ assert lookup.source == "override"
+ assert lookup.value == pytest.approx(9.99)
+
+
+def test_fallback_is_used_only_after_transfers_fail() -> None:
+ policy = ar.RadiiPolicy(
+ kind="van_der_waals",
+ base_set="bondi1964",
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DatasetRef("van_der_waals_radius", "csd_legacy_vdw"),)
+ ),
+ ),
+ fallback=2.5,
+ )
+ lookup = ar.lookup_vdw_radius("Be", policy=policy)
+ assert lookup.source == "fallback"
+ assert lookup.value == pytest.approx(2.5)
+ assert any("placeholder" in note for note in lookup.notes)
+
+
+def test_linear_transfer_rejects_multiple_predictors_in_v0_1() -> None:
+ policy = ar.RadiiPolicy(
+ kind="van_der_waals",
+ base_set="alvarez2013",
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(
+ ar.DatasetRef("atomic_radius", "rahm2016"),
+ ar.DatasetRef("covalent_radius", "cordero2008"),
+ )
+ ),
+ ),
+ )
+ with pytest.raises(PolicyError):
+ ar.lookup_vdw_radius("Pm", policy=policy)
+
+
+def test_base_placeholder_note_is_explicit() -> None:
+ policy = ar.RadiiPolicy(kind='covalent', base_set='csd_legacy_cov')
+ lookup = ar.lookup_covalent_radius('Es', policy=policy)
+ assert lookup.source == 'base'
+ assert lookup.is_placeholder is True
+ assert any('placeholder' in note for note in lookup.notes)
+
+
+def test_substitution_placeholder_note_is_explicit() -> None:
+ lookup = ar.lookup_covalent_radius('Es')
+ assert lookup.source == 'transfer_substitution'
+ assert lookup.is_placeholder is True
+ assert any('placeholder' in note for note in lookup.notes)
+
+
+def test_radii_policy_rejects_normalized_override_collisions() -> None:
+ policy = ar.RadiiPolicy(
+ kind='covalent',
+ base_set='cordero2008',
+ overrides={'H': 0.31, 'D': 0.4},
+ )
+ with pytest.raises(PolicyError):
+ ar.lookup_covalent_radius('H', policy=policy)
+
+
+def test_radii_policy_rejects_non_finite_override() -> None:
+ policy = ar.RadiiPolicy(
+ kind='covalent',
+ base_set='cordero2008',
+ overrides={'C': float('nan')},
+ )
+ with pytest.raises(PolicyError):
+ ar.lookup_covalent_radius('C', policy=policy)
+
+
+def test_radii_policy_rejects_negative_fallback() -> None:
+ policy = ar.RadiiPolicy(
+ kind='van_der_waals',
+ base_set='bondi1964',
+ fallback=-1.0,
+ )
+ with pytest.raises(PolicyError):
+ ar.lookup_vdw_radius('Be', policy=policy)
+
+
+def test_linear_transfer_validates_empty_predictors() -> None:
+ with pytest.raises(PolicyError):
+ ar.LinearTransfer(predictors=())
diff --git a/tests/registry/test_registry.py b/tests/registry/test_registry.py
new file mode 100644
index 0000000..d497d9f
--- /dev/null
+++ b/tests/registry/test_registry.py
@@ -0,0 +1,143 @@
+from __future__ import annotations
+
+from importlib import resources
+from types import MappingProxyType
+
+import pytest
+
+import atomref as ar
+from atomref.errors import DatasetError
+from atomref.registry import get_builtin_set
+
+
+def test_packaged_data_files_exist() -> None:
+ pkg = 'atomref.data'
+ assert resources.files(pkg).joinpath('periodic_table.csv').is_file()
+ assert resources.files(pkg).joinpath('covalent.csv').is_file()
+ assert resources.files(pkg).joinpath('van_der_waals.csv').is_file()
+ assert resources.files(pkg).joinpath('registry.json').is_file()
+
+
+def test_registry_lists_vdw_sets_but_not_atomic_support_sets() -> None:
+ vdw_sets = ar.list_radii_sets('van_der_waals')
+ assert 'alvarez2013' in vdw_sets
+ assert 'rahm2016' not in vdw_sets
+
+
+def test_rahm_is_registered_as_atomic_radius() -> None:
+ info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016'))
+ assert info.ref.quantity == 'atomic_radius'
+ assert info.semantic_class == 'atomic_isodensity'
+ assert info.phase_context == 'isolated_atom'
+
+
+def test_builtin_set_loading_works() -> None:
+ ds = get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008'))
+ assert ds.get('C') == 0.76
+
+
+def test_list_quantities_and_quantity_info() -> None:
+ quantities = ar.list_quantities()
+ assert quantities == (
+ 'covalent_radius',
+ 'van_der_waals_radius',
+ 'atomic_radius',
+ 'xh_bond_length',
+ )
+
+ info = ar.get_quantity_info('atomic_radius')
+ assert info.quantity == 'atomic_radius'
+ assert info.domain == 'element'
+ assert info.units == 'angstrom'
+ assert 'support' in (info.description or '')
+
+
+def test_rahm_note_no_longer_claims_it_is_classified_as_vdw() -> None:
+ info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016'))
+ joined = ' '.join(info.notes).lower()
+ assert 'classified as vdw' not in joined
+ assert 'atomic support data' in joined
+
+
+def test_usage_role_is_exposed_on_dataset_info() -> None:
+ info = ar.get_dataset_info(ar.DatasetRef('atomic_radius', 'rahm2016'))
+ assert info.usage_role == 'support'
+
+
+def test_list_dataset_ids_can_filter_by_usage_role() -> None:
+ assert ar.list_dataset_ids('atomic_radius', usage_role='support') == ('rahm2016',)
+ assert ar.list_dataset_ids('van_der_waals_radius', usage_role='target') == (
+ 'bondi1964',
+ 'rowland_taylor1996',
+ 'alvarez2013',
+ 'chernyshov2020',
+ )
+
+
+def test_list_radii_sets_can_filter_by_usage_role() -> None:
+ assert ar.list_radii_sets('covalent', usage_role='support') == ('csd_legacy_cov',)
+ assert 'alvarez2013' in ar.list_radii_sets('van_der_waals', usage_role='target')
+
+
+def test_list_dataset_infos_can_filter_by_usage_role() -> None:
+ infos = ar.list_dataset_infos('atomic_radius', usage_role='support')
+ assert tuple(info.ref.set_id for info in infos) == ('rahm2016',)
+ assert all(info.usage_role == 'support' for info in infos)
+
+
+def test_list_radii_set_infos_can_filter_by_usage_role() -> None:
+ infos = ar.list_radii_set_infos('van_der_waals', usage_role='target')
+ assert 'alvarez2013' in {info.ref.set_id for info in infos}
+ assert all(info.ref.quantity == 'van_der_waals_radius' for info in infos)
+
+
+def test_public_builtin_set_helper_is_exported() -> None:
+ ds = ar.get_builtin_set(ar.DatasetRef('covalent_radius', 'cordero2008'))
+ assert ds.info.ref.quantity == 'covalent_radius'
+ assert ds.get('C') == 0.76
+
+
+def test_public_radii_set_helper_returns_packaged_radii_set() -> None:
+ ds = ar.get_radii_set('van_der_waals', 'alvarez2013')
+ assert ds.info.ref.quantity == 'van_der_waals_radius'
+ assert ds.info.ref.set_id == 'alvarez2013'
+ assert ds.get('O') == 1.5
+
+
+def test_dataset_info_storage_is_frozen() -> None:
+ info = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008'))
+ assert isinstance(info.storage, MappingProxyType)
+ assert info.storage['column'] == 'cordero2008'
+ with pytest.raises(TypeError):
+ info.storage['column'] = 'broken'
+
+ fresh = ar.get_dataset_info(ar.DatasetRef('covalent_radius', 'cordero2008'))
+ assert fresh.storage is not None
+ assert fresh.storage['column'] == 'cordero2008'
+
+
+def test_dataset_alias_resolution_normalizes_dash_variants() -> None:
+ info = ar.get_dataset_info(
+ ar.DatasetRef('covalent_radius', 'Cordero-Alvarez covalent radii')
+ )
+ assert info.ref.set_id == 'cordero2008'
+
+
+def test_custom_set_rejects_normalized_key_collisions() -> None:
+ with pytest.raises(DatasetError):
+ ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef('covalent_radius', 'demo'),
+ values={'H': 0.31, 'D': 0.5},
+ name='Demo',
+ units='angstrom',
+ )
+
+
+def test_custom_set_rejects_non_finite_values() -> None:
+ with pytest.raises(DatasetError):
+ ar.ElementScalarSet.from_mapping(
+ ref=ar.DatasetRef('covalent_radius', 'demo'),
+ values={'C': float('nan')},
+ name='Demo',
+ units='angstrom',
+ )
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
new file mode 100644
index 0000000..6a96b08
--- /dev/null
+++ b/tests/test_smoke.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+import atomref as ar
+
+
+def test_version_is_present() -> None:
+ assert isinstance(ar.__version__, str)
+ assert ar.__version__
+
+
+def test_basic_smoke_import_and_lookup() -> None:
+ assert ar.get_covalent_radius('C') == 0.76
+ assert ar.get_vdw_radius('C') == 1.77
diff --git a/tests/xh/test_xh.py b/tests/xh/test_xh.py
new file mode 100644
index 0000000..3cffe15
--- /dev/null
+++ b/tests/xh/test_xh.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import pytest
+
+import atomref as ar
+from atomref.errors import PolicyError
+
+
+def test_get_xh_bond_length_returns_curated_cno_values() -> None:
+ assert ar.get_xh_bond_length('C') == pytest.approx(1.089)
+ assert ar.get_xh_bond_length('N') == pytest.approx(1.015)
+ assert ar.get_xh_bond_length('O') == pytest.approx(0.993)
+
+
+def test_lookup_xh_bond_length_infers_other_elements_from_cordero() -> None:
+ lookup = ar.lookup_xh_bond_length('S')
+ assert lookup.source == 'transfer_linear'
+ assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'cordero2008'),)
+ assert lookup.fit is not None
+ assert lookup.fit.n_points == 3
+ assert lookup.value == pytest.approx(1.3587333333333333)
+
+
+def test_lookup_xh_bond_length_rejects_h_as_parent_element() -> None:
+ lookup = ar.lookup_xh_bond_length('H')
+ assert lookup.value is None
+ assert lookup.source == 'missing'
+ assert any('not a valid parent element' in note for note in lookup.notes)
+
+
+def test_list_xh_sets_and_metadata() -> None:
+ assert ar.list_xh_sets() == ('csd_legacy_xh_cno',)
+ info = ar.get_xh_set_info('csd_legacy_xh_cno')
+ assert info.ref.quantity == 'xh_bond_length'
+ assert info.usage_role == 'target'
+ assert info.coverage is not None
+ assert info.coverage.n_values == 3
+
+
+def test_xh_policy_rejects_h_override_key() -> None:
+ policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', overrides={'H': 1.0})
+ with pytest.raises(PolicyError):
+ policy.as_value_policy()
+
+
+def test_xh_policy_rejects_negative_fallback() -> None:
+ policy = ar.XHPolicy(base_set='csd_legacy_xh_cno', fallback=-1.0)
+ with pytest.raises(PolicyError):
+ policy.as_value_policy()
+
+
+def test_xh_policy_accepts_wrapper_policy_predictor() -> None:
+ policy = ar.XHPolicy(
+ base_set='csd_legacy_xh_cno',
+ transfers=(
+ ar.LinearTransfer(
+ predictors=(ar.DEFAULT_COVALENT_POLICY,),
+ min_points=3,
+ exclude_placeholders=True,
+ ),
+ ),
+ )
+ lookup = ar.lookup_xh_bond_length('Bk', policy=policy)
+ assert lookup.source == 'transfer_linear'
+ assert lookup.value == pytest.approx(1.8291333333333335)
+ assert lookup.resolved_from == (ar.DatasetRef('covalent_radius', 'csd_legacy_cov'),)
+ assert any('policy source' in note for note in lookup.notes)
diff --git a/tools/README.md b/tools/README.md
new file mode 100644
index 0000000..943900d
--- /dev/null
+++ b/tools/README.md
@@ -0,0 +1,30 @@
+# tools
+
+This directory contains small maintenance scripts used during development and
+release preparation.
+
+## Scripts
+
+- `check_dist.py` — verify that wheel and source-distribution artifacts contain
+ the key files expected by the project.
+- `check_notebooks.py` — validate notebook JSON and execute notebook code cells.
+- `check_registry.py` — validate curated registry metadata against packaged CSV
+ tables.
+- `export_notebooks.py` — render the bundled notebooks into Markdown pages under
+ `docs/notebooks/`.
+- `gen_readme.py` — regenerate `README.md` from `docs/index.md`.
+- `release_check.py` — run the full release-preparation checklist,
+ including linting, tests, docs, builds, and artifact validation.
+
+## Typical commands
+
+```bash
+python tools/check_registry.py
+python tools/check_notebooks.py
+python tools/export_notebooks.py
+python tools/gen_readme.py
+python tools/release_check.py
+```
+
+The main project README is generated from the documentation home page. To change
+`README.md`, edit `docs/index.md` and then run `python tools/gen_readme.py`.
diff --git a/tools/check_dist.py b/tools/check_dist.py
new file mode 100644
index 0000000..df70910
--- /dev/null
+++ b/tools/check_dist.py
@@ -0,0 +1,116 @@
+"""Verify that built distributions contain the project's key files."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+import tarfile
+import zipfile
+
+
+REQUIRED_WHEEL_MEMBERS = {
+ "atomref/data/periodic_table.csv",
+ "atomref/data/covalent.csv",
+ "atomref/data/van_der_waals.csv",
+ "atomref/data/registry.json",
+ "atomref/py.typed",
+}
+
+REQUIRED_SDIST_SUFFIXES = {
+ "src/atomref/data/periodic_table.csv",
+ "src/atomref/data/covalent.csv",
+ "src/atomref/data/van_der_waals.csv",
+ "src/atomref/data/registry.json",
+ "src/atomref/py.typed",
+ "README.md",
+ "CHANGELOG.md",
+ "DEV_PLAN.md",
+ "LICENSE",
+ "pyproject.toml",
+ "notebooks/01-quickstart.ipynb",
+ "notebooks/02-policies-and-assessment.ipynb",
+ "notebooks/03-custom-sets-and-discovery.ipynb",
+ "docs/notebooks/01-quickstart.md",
+ "docs/notebooks/02-policies-and-assessment.md",
+ "docs/notebooks/03-custom-sets-and-discovery.md",
+ "tools/check_notebooks.py",
+ "tools/export_notebooks.py",
+ "tools/gen_readme.py",
+ "tools/release_check.py",
+ "tools/README.md",
+}
+
+
+class DistCheckError(RuntimeError):
+ """Raised when a built distribution is missing required members."""
+
+
+def _assert_members_present(
+ actual: set[str],
+ required: set[str],
+ *,
+ label: str,
+) -> None:
+ """Raise when ``required`` contains members not present in ``actual``."""
+
+ missing = sorted(required - actual)
+ if missing:
+ joined = ", ".join(missing)
+ raise DistCheckError(f"{label} is missing required members: {joined}")
+
+
+def _members_matching_suffixes(actual: set[str], suffixes: set[str]) -> set[str]:
+ """Return suffixes that match at least one member name from ``actual``."""
+
+ matched: set[str] = set()
+ for suffix in suffixes:
+ if any(name.endswith(suffix) for name in actual):
+ matched.add(suffix)
+ return matched
+
+
+def check_wheel(path: Path) -> None:
+ """Validate the contents of one built wheel."""
+
+ with zipfile.ZipFile(path) as zf:
+ names = set(zf.namelist())
+ matched = {
+ member
+ for member in REQUIRED_WHEEL_MEMBERS
+ if any(name.endswith(member) for name in names)
+ }
+ _assert_members_present(matched, REQUIRED_WHEEL_MEMBERS, label=path.name)
+
+
+def check_sdist(path: Path) -> None:
+ """Validate the contents of one built source distribution."""
+
+ with tarfile.open(path, "r:gz") as tf:
+ names = {member.name for member in tf.getmembers()}
+ matched = _members_matching_suffixes(names, REQUIRED_SDIST_SUFFIXES)
+ _assert_members_present(matched, REQUIRED_SDIST_SUFFIXES, label=path.name)
+
+
+def main() -> None:
+ """Validate wheel and sdist artifacts found in a distribution directory."""
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("dist_dir", type=Path, nargs="?", default=Path("dist"))
+ args = parser.parse_args()
+
+ dist_dir = args.dist_dir
+ wheels = sorted(dist_dir.glob("*.whl"))
+ sdists = sorted(dist_dir.glob("*.tar.gz"))
+ if not wheels:
+ raise DistCheckError(f"no wheel files found in {dist_dir}")
+ if not sdists:
+ raise DistCheckError(f"no source distributions found in {dist_dir}")
+
+ for wheel in wheels:
+ check_wheel(wheel)
+ for sdist in sdists:
+ check_sdist(sdist)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/check_notebooks.py b/tools/check_notebooks.py
new file mode 100644
index 0000000..51d9dfa
--- /dev/null
+++ b/tools/check_notebooks.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Validate notebook JSON structure and execute notebook code cells."""
+
+from __future__ import annotations
+
+from contextlib import redirect_stdout
+import io
+import json
+from pathlib import Path
+import sys
+
+
# Make the in-repo ``src`` layout importable without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
SRC = REPO_ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Notebooks that must ship with the project, validated in this order.
NOTEBOOKS = REPO_ROOT / "notebooks"
REQUIRED_NOTEBOOKS = (
    "01-quickstart.ipynb",
    "02-policies-and-assessment.ipynb",
    "03-custom-sets-and-discovery.ipynb",
)
+
+
class NotebookCheckError(RuntimeError):
    """Raised when a notebook is malformed or fails to execute.

    Single error type used by the loader and execution helpers below so the
    CLI reports every notebook problem consistently.
    """
+
+
def iter_notebooks() -> tuple[Path, ...]:
    """Return the notebooks that are expected to ship with the project."""

    return tuple(map(NOTEBOOKS.joinpath, REQUIRED_NOTEBOOKS))
+
+
def load_notebook(path: Path) -> dict[str, object]:
    """Load one notebook JSON document.

    Raises ``NotebookCheckError`` when the document is not a JSON object.
    """

    document = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(document, dict):
        return document
    raise NotebookCheckError(f"{path.name}: expected top-level JSON object")
+
+
def iter_code_cells(data: dict[str, object], *, path: Path) -> tuple[str, ...]:
    """Return notebook code-cell sources in order.

    Non-code cells are skipped. Raises ``NotebookCheckError`` when the cell
    list is missing, a cell is malformed, or no code cells exist at all.
    """

    cells = data.get("cells")
    if not isinstance(cells, list):
        raise NotebookCheckError(f"{path.name}: missing notebook cell list")

    sources: list[str] = []
    for index, cell in enumerate(cells):
        if not isinstance(cell, dict):
            raise NotebookCheckError(f"{path.name}: cell {index} is not an object")
        if cell.get("cell_type") != "code":
            continue
        raw = cell.get("source", [])
        if isinstance(raw, str):
            sources.append(raw)
        elif isinstance(raw, list) and all(isinstance(piece, str) for piece in raw):
            # Notebook JSON commonly stores source as a list of line strings.
            sources.append("".join(raw))
        else:
            raise NotebookCheckError(
                f"{path.name}: cell {index} has invalid code source"
            )
    if not sources:
        raise NotebookCheckError(f"{path.name}: contains no code cells")
    return tuple(sources)
+
+
def execute_notebook(path: Path) -> None:
    """Execute all code cells from one notebook in a shared namespace.

    Cells share one namespace so later cells see earlier definitions; stdout
    is suppressed. Any failure is re-raised as ``NotebookCheckError`` with
    the original exception chained.
    """

    if not path.exists():
        raise NotebookCheckError(f"missing notebook: {path}")
    shared = {"__name__": "__main__"}
    cell_sources = iter_code_cells(load_notebook(path), path=path)
    for cell_number, cell_source in enumerate(cell_sources, start=1):
        if not cell_source.strip():
            continue
        try:
            compiled = compile(cell_source, f"{path.name}::cell{cell_number}", "exec")
            with redirect_stdout(io.StringIO()):
                exec(compiled, shared, shared)
        except Exception as exc:  # noqa: BLE001
            raise NotebookCheckError(
                f"{path.name}: execution failed in code cell {cell_number}: {exc}"
            ) from exc
+
+
def main() -> int:
    """Validate and execute every required notebook; return an exit code."""

    checked = 0
    for notebook in iter_notebooks():
        execute_notebook(notebook)
        checked += 1
    print(f"Validated {checked} notebook(s).")
    return 0
+
+
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/tools/check_registry.py b/tools/check_registry.py
new file mode 100644
index 0000000..3af6025
--- /dev/null
+++ b/tools/check_registry.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Validate packaged registry metadata against bundled CSV tables."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import asdict
+from importlib import import_module
+from pathlib import Path
+import sys
+from typing import Iterable
+
# Make the in-repo ``src`` layout importable without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
SRC = REPO_ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Usage roles that dataset metadata is allowed to declare.
_ALLOWED_USAGE_ROLES = {"target", "support"}
+
+
def _load_atomref_module():
    """Import and return the top-level ``atomref`` package (from ``src``)."""
    return import_module("atomref")
+
+
def _get_builtin_set(ref):
    """Return the bundled dataset object registered for ``ref``."""
    registry = import_module("atomref.registry")
    return registry.get_builtin_set(ref)
+
+
def _canonical_token(value: str) -> str:
    """Canonicalize one alias token the way registry lookups do.

    NOTE(review): deliberately reaches into the private
    ``_canonicalize_alias_token`` helper so this checker matches the
    registry's lookup semantics exactly.
    """
    registry = import_module("atomref.registry")
    return registry._canonicalize_alias_token(value)
+
+
def _iter_dataset_refs() -> Iterable[object]:
    """Yield the registry ref of every dataset, across all quantities."""
    ar = _load_atomref_module()
    for quantity in ar.list_quantities():
        yield from (info.ref for info in ar.list_dataset_infos(quantity))
+
+
def _validate_alias_collisions(errors: list[str]) -> None:
    """Append an error for every alias token that maps to two set ids.

    Within each quantity, a canonicalized token (set id or alias) must
    resolve to exactly one dataset set id.
    """
    ar = _load_atomref_module()
    for quantity in ar.list_quantities():
        resolved: dict[str, str] = {}
        for info in ar.list_dataset_infos(quantity):
            set_id = info.ref.set_id
            for token in (set_id, *info.aliases):
                key = _canonical_token(token)
                # First owner of a canonical key wins; a later, different
                # set id claiming the same key is a collision.
                owner = resolved.setdefault(key, set_id)
                if owner != set_id:
                    errors.append(
                        f"alias collision in {quantity!r}: {token!r} resolves to both "
                        f"{owner!r} and {set_id!r}"
                    )
+
+
def _validate_dataset_metadata(errors: list[str]) -> None:
    """Cross-check every dataset's metadata against its bundled values.

    Appends one message to ``errors`` per problem: ref/domain/units
    mismatches, invalid usage roles, missing references, bad storage
    metadata, and coverage records that disagree with the stored values.
    Finally verifies that each quantity has at least one "target" dataset.
    """
    ar = _load_atomref_module()
    quantities = set(ar.list_quantities())
    # Quantity names seen per usage role; used for the final target check.
    by_role: dict[str, list[str]] = defaultdict(list)

    for ref in _iter_dataset_refs():
        quantity_info = ar.get_quantity_info(ref.quantity)
        info = ar.get_dataset_info(ref)
        dataset = _get_builtin_set(ref)

        # The info record must point back at the ref it was looked up with.
        if info.ref != ref:
            errors.append(f"dataset ref mismatch: requested {ref!r}, got {info.ref!r}")

        # Domain and units must agree between the quantity and the dataset.
        if info.domain != quantity_info.domain:
            msg = (
                f"domain mismatch for {ref!r}: quantity={quantity_info.domain!r}, "
                f"dataset={info.domain!r}"
            )
            errors.append(msg)

        if info.units != quantity_info.units:
            msg = (
                f"units mismatch for {ref!r}: quantity={quantity_info.units!r}, "
                f"dataset={info.units!r}"
            )
            errors.append(msg)

        if info.usage_role not in _ALLOWED_USAGE_ROLES:
            errors.append(f"invalid usage_role for {ref!r}: {info.usage_role!r}")
        else:
            by_role[info.usage_role].append(ref.quantity)

        if not info.references:
            errors.append(f"missing references for {ref!r}")

        if info.storage is None:
            errors.append(f"missing storage metadata for {ref!r}")
        else:
            filename = info.storage.get("filename")
            column = info.storage.get("column")
            fmt = info.storage.get("format")
            if not isinstance(filename, str) or not filename:
                errors.append(f"invalid storage filename for {ref!r}: {filename!r}")
            if not isinstance(column, str) or not column:
                errors.append(f"invalid storage column for {ref!r}: {column!r}")
            # Only one storage format is currently supported.
            if fmt != "dense_by_z_csv":
                errors.append(f"unsupported storage format for {ref!r}: {fmt!r}")

        coverage = info.coverage
        if coverage is None:
            errors.append(f"missing coverage metadata for {ref!r}")
            # Fall back to the full extent of the stored values.
            max_z = len(dataset.values_by_z) - 1
        else:
            max_z = (
                coverage.z_max
                if coverage.z_max is not None
                else len(dataset.values_by_z) - 1
            )

        # values_by_z is positionally indexed (presumably by atomic number Z
        # -- confirm); index 0 is always skipped below.
        covered_z = tuple(
            z
            for z, value in enumerate(dataset.values_by_z)
            if z > 0 and value is not None and z <= max_z
        )
        covered_set = set(covered_z)
        missing_z = tuple(z for z in range(1, max_z + 1) if z not in covered_set)
        # Placeholder detection uses an absolute float tolerance.
        has_placeholders = info.placeholder_value is not None and any(
            value is not None and abs(value - info.placeholder_value) < 1e-12
            for value in dataset.values_by_z[1 : max_z + 1]
        )

        if coverage is not None:
            # Recompute the summary fields and compare against the stored
            # coverage record field by field.
            expected = {
                "n_values": len(covered_z),
                "z_min": min(covered_z) if covered_z else None,
                "z_max": max(covered_z) if covered_z else None,
                "has_placeholders": has_placeholders,
            }
            actual = asdict(coverage)
            for key, value in expected.items():
                if actual[key] != value:
                    msg = (
                        f"coverage mismatch for {ref!r}: {key} is {actual[key]!r}, "
                        f"expected {value!r}"
                    )
                    errors.append(msg)
            # The stored tuples are only compared when non-empty.
            if actual["covered_z"] and tuple(actual["covered_z"]) != covered_z:
                msg = (
                    f"coverage mismatch for {ref!r}: covered_z is "
                    f"{actual['covered_z']!r}, expected {covered_z!r}"
                )
                errors.append(msg)
            if actual["missing_z"] and tuple(actual["missing_z"]) != missing_z:
                msg = (
                    f"coverage mismatch for {ref!r}: missing_z is "
                    f"{actual['missing_z']!r}, expected {missing_z!r}"
                )
                errors.append(msg)

        if ref.quantity not in quantities:
            errors.append(f"dataset refers to unknown quantity: {ref!r}")

    for quantity in quantities:
        # "atomic_radius" is exempted from the target requirement here;
        # presumably it ships only support datasets -- confirm intent.
        if quantity not in by_role.get("target", []) and quantity != "atomic_radius":
            errors.append(f"quantity {quantity!r} has no target datasets")
+
+
def main() -> int:
    """Run all registry validations; print errors and return an exit code."""
    problems: list[str] = []
    _validate_alias_collisions(problems)
    _validate_dataset_metadata(problems)

    if not problems:
        print("Registry validation passed.")
        return 0

    for problem in problems:
        print(f"ERROR: {problem}")
    return 1
+
+
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/tools/export_notebooks.py b/tools/export_notebooks.py
new file mode 100644
index 0000000..aa6761d
--- /dev/null
+++ b/tools/export_notebooks.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""Export bundled notebooks to Markdown pages for the docs."""
+
+from __future__ import annotations
+
+from contextlib import redirect_stdout
+import argparse
+import io
+import json
+from pathlib import Path
+import sys
+
+
# Make the in-repo ``src`` layout importable without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
SRC = REPO_ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Mapping of bundled notebook filenames to their exported Markdown pages.
NOTEBOOKS = REPO_ROOT / "notebooks"
DEFAULT_OUTPUT_DIR = REPO_ROOT / "docs" / "notebooks"
NOTEBOOK_OUTPUTS = {
    "01-quickstart.ipynb": "01-quickstart.md",
    "02-policies-and-assessment.ipynb": "02-policies-and-assessment.md",
    "03-custom-sets-and-discovery.ipynb": "03-custom-sets-and-discovery.md",
}
# Text prepended to every exported page. NOTE(review): this is currently
# only blank lines -- it looks like a generated-file banner may have been
# stripped; confirm the intended header content.
HEADER = (
    "\n"
    "\n\n"
)
+
+
class NotebookExportError(RuntimeError):
    """Raised when notebook export fails.

    Covers malformed notebook JSON as well as code cells that raise during
    execution (the original exception is chained as ``__cause__``).
    """
+
+
+def _load_notebook(path: Path) -> dict[str, object]:
+ """Load one notebook JSON document."""
+
+ data = json.loads(path.read_text(encoding="utf-8"))
+ if not isinstance(data, dict):
+ raise NotebookExportError(f"{path.name}: expected top-level JSON object")
+ return data
+
+
+def _cell_source(cell: dict[str, object], *, path: Path, index: int) -> str:
+ """Return normalized source text for one notebook cell."""
+
+ source = cell.get("source", [])
+ if isinstance(source, str):
+ return source
+ if isinstance(source, list) and all(isinstance(line, str) for line in source):
+ return "".join(source)
+ raise NotebookExportError(f"{path.name}: invalid source in cell {index}")
+
+
def _export_markdown(path: Path) -> str:
    """Render one notebook as Markdown, executing code cells for output.

    Markdown cells are emitted verbatim; code cells become fenced Python
    blocks and are executed in one shared namespace (so later cells see
    earlier definitions), with captured stdout appended as a fenced text
    block. Raises ``NotebookExportError`` for malformed cells or when a
    code cell fails to execute.
    """

    data = _load_notebook(path)
    cells = data.get("cells")
    if not isinstance(cells, list):
        raise NotebookExportError(f"{path.name}: missing notebook cell list")

    # One namespace shared by every code cell, like a notebook kernel.
    namespace = {"__name__": "__main__"}
    parts: list[str] = [HEADER]
    parts.append(
        f"[Open the original notebook on GitHub]"
        f"(https://github.com/DeloneCommons/atomref/blob/main/notebooks/{path.name})\n"
    )

    for index, cell in enumerate(cells, start=1):
        if not isinstance(cell, dict):
            raise NotebookExportError(f"{path.name}: cell {index} is not an object")
        source = _cell_source(cell, path=path, index=index)
        cell_type = cell.get("cell_type")
        if cell_type == "markdown":
            text = source.strip()
            if text:
                parts.append(f"{text}\n")
            continue
        if cell_type != "code":
            # Other cell types (e.g. "raw") are silently skipped.
            continue
        code_text = source.rstrip()
        parts.append("```python\n")
        parts.append(f"{code_text}\n")
        parts.append("```\n")
        if not code_text.strip():
            # Empty code cells are rendered but not executed.
            continue

        stdout = io.StringIO()
        try:
            code = compile(code_text, f"{path.name}::cell{index}", "exec")
            with redirect_stdout(stdout):
                exec(code, namespace, namespace)
        except Exception as exc:  # noqa: BLE001
            raise NotebookExportError(
                f"{path.name}: execution failed in code cell {index}: {exc}"
            ) from exc

        output = stdout.getvalue().rstrip()
        if output:
            parts.append("**Output**\n\n")
            parts.append("```text\n")
            parts.append(f"{output}\n")
            parts.append("```\n")

    # Blank line between parts; the page ends with exactly one newline.
    return "\n".join(part.rstrip() for part in parts if part).rstrip() + "\n"
+
+
def export_notebooks(output_dir: Path, *, check: bool = False) -> int:
    """Export bundled notebooks or verify that exported pages are in sync.

    When ``check`` is true, compares each rendered page against the file in
    ``output_dir`` and returns 1 on the first mismatch (including a missing
    page); otherwise writes every page with LF line endings and returns 0.
    """

    output_dir.mkdir(parents=True, exist_ok=True)
    for notebook_name, output_name in NOTEBOOK_OUTPUTS.items():
        notebook_path = NOTEBOOKS / notebook_name
        rendered = _export_markdown(notebook_path)
        output_path = output_dir / output_name
        if check:
            # A missing exported page is "out of sync", not an unhandled
            # FileNotFoundError traceback.
            if not output_path.exists():
                print(
                    f"{output_path} is out of sync with {notebook_path.name}",
                    file=sys.stderr,
                )
                return 1
            # Normalize CRLF so a Windows checkout does not fail the check.
            current = output_path.read_text(encoding="utf-8").replace("\r\n", "\n")
            if current != rendered:
                print(
                    f"{output_path} is out of sync with {notebook_path.name}",
                    file=sys.stderr,
                )
                return 1
        else:
            output_path.write_text(rendered, encoding="utf-8", newline="\n")
    return 0
+
+
def main() -> int:
    """CLI entry point: export the notebook pages, or verify with ``--check``."""

    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
    parser.add_argument(
        "--check",
        action="store_true",
        help="exit with status 1 when exported pages are out of sync",
    )
    options = parser.parse_args()
    return export_notebooks(options.output_dir, check=options.check)
+
+
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/tools/gen_readme.py b/tools/gen_readme.py
new file mode 100644
index 0000000..71b954d
--- /dev/null
+++ b/tools/gen_readme.py
@@ -0,0 +1,61 @@
+"""Generate ``README.md`` from the documentation home page."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+import sys
+
+
# The docs home page is the source of truth; README.md is generated from it.
REPO_ROOT = Path(__file__).resolve().parents[1]
SOURCE = REPO_ROOT / "docs" / "index.md"
README = REPO_ROOT / "README.md"
# Appended verbatim after the docs page body; explains how to regenerate.
FOOTER = """

---

This README is generated from `docs/index.md`.

To regenerate it:

```bash
python tools/gen_readme.py
```

Edit the documentation sources instead of editing `README.md` directly.
"""
+
+
def render_readme() -> str:
    """Return the generated README text (docs page body plus footer)."""

    page_body = SOURCE.read_text(encoding="utf-8")
    return page_body.rstrip() + FOOTER
+
+
def main() -> int:
    """Generate or verify the repository README file.

    With ``--check``, compares the rendered README against the target file
    and returns 1 when they differ (or when the target is missing);
    otherwise writes the rendered text and returns 0.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=Path, default=README)
    parser.add_argument(
        "--check",
        action="store_true",
        help="exit with status 1 when the target file is out of sync",
    )
    args = parser.parse_args()

    rendered = render_readme()
    if args.check:
        # A missing target is "out of sync", not an unhandled
        # FileNotFoundError traceback.
        if not args.output.exists():
            print(f"{args.output} is out of sync with docs/index.md", file=sys.stderr)
            return 1
        # Normalize CRLF so a Windows checkout does not fail the check;
        # mirrors tools/export_notebooks.py.
        current = args.output.read_text(encoding="utf-8").replace("\r\n", "\n")
        if current != rendered:
            print(f"{args.output} is out of sync with docs/index.md", file=sys.stderr)
            return 1
        return 0

    # Write LF endings on every platform, consistent with .gitattributes
    # (eol=lf) and tools/export_notebooks.py.
    args.output.write_text(rendered, encoding="utf-8", newline="\n")
    return 0
+
+
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/tools/release_check.py b/tools/release_check.py
new file mode 100644
index 0000000..a357a18
--- /dev/null
+++ b/tools/release_check.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""Run the full release-preparation checks for the repository.
+
+This helper is intended for local release preparation. It runs the same checks
+that are exercised separately in CI, then builds source and wheel artifacts,
+validates them, and smoke-tests the built wheel in an isolated virtual
+environment.
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+import shutil
+import subprocess
+import sys
+import tempfile
+import venv
+
+
# Build artifact locations, relative to the repository root.
REPO_ROOT = Path(__file__).resolve().parents[1]
DIST_DIR = REPO_ROOT / "dist"
BUILD_DIR = REPO_ROOT / "build"
+
+
def _run(*args: str, env: dict[str, str] | None = None) -> None:
    """Echo and execute one command from the repository root.

    Raises ``subprocess.CalledProcessError`` when the command exits
    non-zero (``check=True``).
    """

    echoed = " ".join(args)
    print("+", echoed)
    subprocess.run(args, cwd=REPO_ROOT, check=True, env=env)
+
+
def _fresh_build_dirs() -> None:
    """Remove build artifacts from previous runs."""

    for stale_dir in (DIST_DIR, BUILD_DIR):
        shutil.rmtree(stale_dir, ignore_errors=True)
+
+
def _smoke_test_wheel() -> None:
    """Install the built wheel into a temporary virtualenv and import it."""

    wheels = sorted(DIST_DIR.glob("*.whl"))
    if not wheels:
        raise RuntimeError("no wheel found in dist/")
    # Lexicographically last wheel; after _fresh_build_dirs there is only one.
    wheel = wheels[-1]

    with tempfile.TemporaryDirectory(prefix="atomref-release-check-") as tmp:
        env_dir = Path(tmp) / "venv"
        builder = venv.EnvBuilder(with_pip=True)
        builder.create(env_dir)
        # Windows venvs put executables under Scripts/, POSIX under bin/.
        bindir = "Scripts" if sys.platform.startswith("win") else "bin"
        python = env_dir / bindir / "python"
        _run(str(python), "-m", "pip", "install", "--no-deps", str(wheel))
        # Import the installed package and spot-check a few public APIs so a
        # broken wheel (missing data files, bad metadata) fails loudly.
        _run(
            str(python),
            "-c",
            (
                "import atomref as ar; "
                "assert ar.get_covalent_radius('C') == 0.76; "
                "assert ar.get_vdw_radius('C') == 1.77; "
                "assert 'atomic_radius' in ar.list_quantities(); "
                "assert 'rahm2016' in ar.list_dataset_ids("
                "'atomic_radius', usage_role='support')"
            ),
        )
+
+
def main() -> int:
    """Run lint, tests, docs, build, metadata, and wheel smoke checks."""

    parser = argparse.ArgumentParser(
        description="Run the full release-preparation checks for the repository.",
    )
    parser.add_argument(
        "--skip-docs",
        action="store_true",
        help="skip the strict MkDocs build step",
    )
    parser.add_argument(
        "--skip-smoke-test",
        action="store_true",
        help="skip the temporary-virtualenv wheel import smoke test",
    )
    args = parser.parse_args()

    # Static checks and the test suite, in the same order CI runs them.
    for command in (
        ("flake8", "src", "tests", "tools"),
        (sys.executable, "tools/check_registry.py"),
        (sys.executable, "tools/check_notebooks.py"),
        (sys.executable, "tools/export_notebooks.py", "--check"),
        (sys.executable, "tools/gen_readme.py", "--check"),
        (sys.executable, "-m", "pytest", "-q"),
    ):
        _run(*command)
    if not args.skip_docs:
        _run("mkdocs", "build", "--strict")

    # Build fresh artifacts, validate their metadata and contents.
    _fresh_build_dirs()
    _run(sys.executable, "-m", "build")
    _run(sys.executable, "-m", "twine", "check", "dist/*")
    _run(sys.executable, "tools/check_dist.py", "dist")
    if not args.skip_smoke_test:
        _smoke_test_wheel()
    return 0
+
+
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())