diff --git a/ChangeLog.rst b/ChangeLog.rst
index 8a7efa9..5ea8a37 100644
--- a/ChangeLog.rst
+++ b/ChangeLog.rst
@@ -23,6 +23,13 @@ Known issues
Change history
==============
+2026-05-21 R2.2.0
+-----------------
+
+Modified:
+
+* Use lark for better error reporting from the formula parser
+
2026-02-27 R2.1.0
-----------------
diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py
index dad6c9e..d65594d 100644
--- a/doc/sphinx/conf.py
+++ b/doc/sphinx/conf.py
@@ -27,6 +27,7 @@
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('_extensions'))
import periodictable
+from periodictable.lark_parse import grammar
# -- General configuration -----------------------------------------------------
@@ -65,6 +66,11 @@
('py:class', 'numpy._typing._array_like._ScalarT'),
('py:class', 'numpy._typing._nested_sequence._NestedSequence'),
('py:class', 'pyparsing.core.ParserElement'),
+ ('py:class', 'lark.tree.Tree'),
+ ('py:class', 'lark.lexer.Token'),
+ ('py:class', 'lark.visitors.Transformer'),
+ ('py:class', 'lark.visitors._Leaf_T'),
+ ('py:class', 'lark.visitors._Return_T'),
('py:class', 'periodictable.core._AtomBase'),
('py:class', 'periodictable.core.IonSet'),
@@ -300,4 +306,3 @@
if os.path.exists('rst_prolog'):
with io.open('rst_prolog', encoding='utf-8') as fid:
rst_prolog = fid.read()
-
diff --git a/doc/sphinx/genmods.py b/doc/sphinx/genmods.py
index 9cdc46a..4b930e5 100644
--- a/doc/sphinx/genmods.py
+++ b/doc/sphinx/genmods.py
@@ -55,6 +55,7 @@ def genfiles(package, package_name, modules, dir='api'):
#('__init__', 'Top level namespace'),
('core', 'Core table'),
('formulas', 'Chemical formula operations'),
+ ('lark_parse', 'Chemical formula parser'),
('covalent_radius', 'Covalent radius'),
('constants', 'Fundamental constants'),
('crystal_structure', 'Crystal structure'),
diff --git a/doc/sphinx/guide/formula_grammar.rst b/doc/sphinx/guide/formula_grammar.rst
index db1ab2c..014b795 100644
--- a/doc/sphinx/guide/formula_grammar.rst
+++ b/doc/sphinx/guide/formula_grammar.rst
@@ -159,28 +159,68 @@ The grammar used for parsing formula strings is the following:
::
- formula :: compound | mixture | nothing
- mixture :: quantity | percentage
- quantity :: number unit part ('//' number unit part)*
- percentage :: number 'wt%|vol%' part ('//' number '%' part)* '//' part
- part :: compound | '(' mixture ')'
- compound :: (composite | fasta) density?
- fasta :: ('dna' | 'rna' | 'aa') ':' [A-Z -*]+
- composite :: group (separator group)*
- group :: number element+ | '(' formula ')' number
- element :: symbol isotope? ion? number?
- symbol :: [A-Z][a-z]*
- isotope :: '[' integer ']'
- ion :: '{' integer? [+-] '}'
- density :: '@' number [ni]?
- number :: integer | fraction
- integer :: [1-9][0-9]*
- fraction :: ([1-9][0-9]* | 0)? '.' [0-9]*
- separator :: space? '+'? space?
- unit :: mass | volume | length
- mass :: 'kg' | 'g' | 'mg' | 'ug' | 'ng'
- volume :: 'L' | 'mL' | 'uL' | 'nL'
- length :: 'cm' | 'mm' | 'um' | 'nm'
+ # formula: composite @ density | str:sequence @ density | mixture
+ formula : compound | mixture
+ compound : (composite | fasta) [density]
+ # Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
+ # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
+
+ # Activation only cares about total mass, so you can freely mix masses and volumes if
+ # you have the density for each component. For scattering you need the density of the
+ # mixture. When this is different from the mixture of densities use (mixture)@density.
+ # For thin film samples, allow stacking of layers with the thickness of each layer.
+ # With density for each layer the relative quantities of each element in the stack can
+ # be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²).
+
+ # mixture: quantity compound // quantity compound // ...
+ mixture : byamount | byvolume | byweight | layers
+ byamount : quantity compound (MIX quantity compound)*
+ byvolume : volumepct compound (MIX percentage compound)* MIX compound
+ byweight : weightpct compound (MIX percentage compound)* MIX compound
+ layers : thickness compound (MIX thickness compound)*
+ quantity : NUMBER SPACE? (MASS | VOLUME) SPACE
+ weightpct : NUMBER SPACE? WEIGHTPCT SPACE
+ volumepct : NUMBER SPACE? VOLUMEPCT SPACE
+ thickness : NUMBER SPACE? LENGTH SPACE
+ percentage : NUMBER SPACE? "%" SPACE # Allows "3 % "
+ MIX : SPACE? "//" SPACE?
+ WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+ VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+ MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+ VOLUME : "L" | "mL" | "uL" | "μL" | "nL"
+ LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
+
+ # FASTA sequence: (rna|dna|aa):SEQUENCE @ density
+ fasta : FASTA ":" SEQUENCE
+ FASTA : /[a-z]+/ # str:sequence reports better errors than /dna|rna|aa/:sequence
+ SEQUENCE : /[-A-Z *]+/
+
+ # composite: number group number group ... @density
+ # group: El count El count ...
+ composite : [NUMBER] group (SEPARATOR [NUMBER] group)*
+ group : ((atom | isoatom | "(" formula ")") [COUNT])+
+ atom : SYMBOL [isotope] [valence]
+ isoatom : SUPERINT SYMBOL [valence] # For example ²H for deuterium
+ isotope : "[" INTEGER "]"
+ valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
+ density : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
+ # could list all elements, but better error reporting if element symbol lookup fails
+ SYMBOL : /[A-Z][a-z]*/
+ CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--}
+ SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻
+ DENSITY : NUMBER # using alias DENSITY for number for better error reporting
+ DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density
+ COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts
+ SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE # For example, CaCO₃·6H₂O
+
+ SPACE : /[ \t\n\r]+/
+ NUMBER : INTEGER | FRACTION
+ INTEGER : /[1-9][0-9]*/
+ FRACTION : /([1-9][0-9]*|0)?[.][0-9]*/ # allow all floats?
+ SUBNUM : SUBINT | SUBFRAC
+ SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
+ SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
+ SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
Formulas can also be constructed from atoms or other formulas:
@@ -259,18 +299,26 @@ following is a 2:1 mixture of water and heavy water:
>>> H2O = formula('H2O',natural_density=1)
>>> D2O = formula('D2O',natural_density=1)
>>> mix = mix_by_volume(H2O,2,D2O,1)
- >>> print(f"{mix} {mix.density:.4g}")
- (H2O)2D2O 1.037
+ >>> print(f"{mix} @ {mix.density:.4g}")
+ (H2O)2D2O @ 1.037
-Note that this is different from a 2:1 mixture by weight:
+This is different from a 2:1 mixture by weight:
>>> mix = mix_by_weight(H2O,2,D2O,1)
- >>> print(f"{mix} {mix.density:.4g}")
- (H2O)2.22339D2O 1.035
+ >>> print(f"{mix} @ {mix.density:.4g}")
+ (H2O)2.22339D2O @ 1.035
Except in the simplest of cases, the density of the mixture cannot be
-computed from the densities of the components, and the resulting density
-should be set explicitly.
+computed from the densities of the components. Even when the component
+densities are known, the resulting density should be set explicitly:
+
+ >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9)
+ >>> print(f"{mix} @ {mix.density:.4g}")
+ NaCl(H2O)29.1956 @ 1.057
+ >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9, density=1.07)
+ >>> print(f"{mix} @ {mix.density:.4g}")
+ NaCl(H2O)29.1956 @ 1.07
+
Derived values
--------------
diff --git a/periodictable/core.py b/periodictable/core.py
index 534a258..fb095b3 100644
--- a/periodictable/core.py
+++ b/periodictable/core.py
@@ -398,9 +398,9 @@ def __init__(self, element_or_isotope: Element|Isotope):
def __getitem__(self, charge: int) -> Ion:
if charge not in self.ionset:
if charge not in self.element_or_isotope.ions:
- raise ValueError("%(charge)d is not a valid charge for %(symbol)s"
- % dict(charge=charge,
- symbol=self.element_or_isotope.symbol))
+ valence = f"{abs(charge)}{'-' if charge < 0 else '+'}"
+ symbol = self.element_or_isotope.symbol
+ raise ValueError(f"valence {valence} is not valid for {symbol}")
self.ionset[charge] = Ion(self.element_or_isotope, charge)
return self.ionset[charge]
diff --git a/periodictable/fasta.py b/periodictable/fasta.py
index b378477..f95d9ed 100644
--- a/periodictable/fasta.py
+++ b/periodictable/fasta.py
@@ -75,7 +75,7 @@
from collections.abc import Iterator
from typing import IO, cast
-from .formulas import formula as parse_formula, Formula, FormulaInput
+from .formulas import formula as make_formula, Formula, FormulaInput
from .nsf import neutron_sld
from .xsf import xray_sld
from .core import default_table, Atom
@@ -177,7 +177,7 @@ def __init__(
elements = default_table()
# Fill in density or cell_volume.
- M = parse_formula(formula, natural_density=density)
+ M = make_formula(formula, natural_density=density)
# CRUFT: use of T rather than H[1] is deprecated since 1.5.3
if elements.T in M.atoms:
warnings.warn("Use of tritium for labile hydrogen is deprecated."
@@ -274,7 +274,7 @@ def __init__(self, name: str, sequence: str, type: str='aa'):
structure.extend(list(p.labile_formula.structure))
# Add H + OH terminators to the sequence
structure.extend(((2, elements.H[1]), (1, elements.O)))
- formula = parse_formula(structure).hill
+ formula = make_formula(structure).hill
Molecule.__init__(
self, name, formula, cell_volume=cell_volume, charge=charge)
@@ -356,7 +356,7 @@ def _code_average(bases, code_table) -> tuple[Formula, float, float]:
Note: averaging can lead to a fractional charge on the returned molecule.
"""
n = len(bases)
- formula, cell_volume, charge = parse_formula(), 0., 0.
+ formula, cell_volume, charge = make_formula(), 0., 0.
for c in bases:
base = code_table[c]
formula += base.labile_formula
diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index f731c93..8c16a1b 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -6,18 +6,13 @@
from copy import copy
from math import pi, sqrt
-from typing import cast, Union, Any
+from typing import cast, Union, Any, Iterable, TYPE_CHECKING
from collections.abc import Sequence, Callable
-# Requires that the pyparsing module is installed.
-
-from pyparsing import (ParserElement, Literal, Optional, White, Regex,
- ZeroOrMore, OneOrMore, Forward, StringEnd, Group)
-
from .core import default_table, isatom, isisotope, ision, change_table
-from .core import Atom, Element, Isotope, Ion, PeriodicTable # for typing
+from .core import Atom, Isotope, Ion, PeriodicTable # for typing
from .constants import avogadro_number, electron_mass
-from .util import cell_volume
+from .util import cell_volume, unicode_subscript, unicode_superscript
FormulaInput = Union[str, "Formula", Atom, dict[Atom, float], Sequence[tuple[float, Any]], None]
Fragment = tuple[float, Union[Atom, "Structure"]]
@@ -89,7 +84,7 @@ def mix_by_weight(*args, **kw) -> "Formula":
result.name = name
return result
-def _mix_by_weight_pairs(pairs: list[tuple["Formula", float]]) -> "Formula":
+def _mix_by_weight_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula":
from .formulas import Formula # For running as __main__
# Drop pairs with zero quantity
@@ -175,7 +170,7 @@ def mix_by_volume(*args, **kw) -> "Formula":
result.name = name
return result
-def _mix_by_volume_pairs(pairs: list[tuple["Formula", float]]) -> "Formula":
+def _mix_by_volume_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula":
from .formulas import Formula # For running as __main__
# Drop pairs with zero quantity
@@ -227,7 +222,7 @@ def formula(
change in cell volume.
*name* : string
- Common name for the molecule.
+ Common name for the material.
*table* : PeriodicTable
Private table to use when parsing string formulas.
@@ -288,6 +283,7 @@ def formula(
display purposes.
"""
from .formulas import Formula # For running as __main__
+ from .lark_parse import parse_formula
structure: Structure
if compound is None or compound == '':
@@ -328,8 +324,25 @@ class Formula:
Simple chemical formula representation.
"""
structure: Structure
+ """Nested structure ((count, atom|structure), ...)"""
density: float|None
+ """
+ |g/cm^3|
+
+ Density of the material.
+ """
name: str|None
+ """
+ Name of the material. Default is the input string for the formula parser.
+ """
+ total_mass: float|None = None
+ """
+ For mixture by mass, the total mass of the mixture (g).
+ """
+ thickness: float|None = None
+ """
+ For mixture by layer, the total thickness of the mixture (cm).
+ """
def __init__(self,
structure: Structure=tuple(),
@@ -411,7 +424,7 @@ def natural_density(self) -> float | None:
"""
|g/cm^3|
- Density of the formula with specific isotopes of each element
+ Density of the material with specific isotopes of each element
replaced by the naturally occurring abundance of the element
without changing the cell volume.
"""
@@ -675,7 +688,8 @@ def __rmul__(self, other):
return ret
def __str__(self):
- return self.name if self.name else _str_atoms(self.structure)
+ # return self.name if self.name else "".join(_str_atoms(self.structure))
+ return "".join(_str_atoms(self.structure))
def __repr__(self):
return "formula('%s')"%(str(self))
@@ -709,296 +723,6 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti
density = compound.density
return formula(atoms, density=density)
-
-# TODO: Grammar should be independent of table
-# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix
-LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2}
-MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3}
-VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
-LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')'
-MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')'
-def formula_grammar(table: PeriodicTable) -> ParserElement:
- """
- Construct a parser for molecular formulas.
-
- :Parameters:
-
- *table* = None : PeriodicTable
- If table is specified, then elements and their associated fields
- will be chosen from that periodic table rather than the default.
-
- :Returns:
- *parser* : pyparsing.ParserElement.
- The ``parser.parse_string()`` method returns a list of
- pairs (*count, fragment*), where fragment is an *isotope*,
- an *element* or a list of pairs (*count, fragment*).
-
- """
- # TODO: fix circular imports
- # This ickiness is because the formula class returned from the circular
- # import of fasta does not match the local formula class.
- from .formulas import Formula
-
- # Recursive
- composite = Forward()
- mixture = Forward()
-
- # whitespace and separators
- space = Optional(White().suppress())
- separator = space+Literal('+').suppress()+space
-
- # Lookup the element in the element table
- symbol = Regex("[A-Z][a-z]?")
- symbol.set_parse_action(lambda s, l, t: table.symbol(t[0]))
-
- # Translate isotope
- openiso = Literal('[').suppress()
- closeiso = Literal(']').suppress()
- isotope = Optional(~White()+openiso+Regex("[1-9][0-9]*")+closeiso,
- default='0')
- isotope.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 0)
-
- # Translate ion
- openion = Literal('{').suppress()
- closeion = Literal('}').suppress()
- ion = Optional(~White() +openion +Regex("([1-9][0-9]*)?[+-]") +closeion,
- default='0+')
- ion.set_parse_action(lambda s, l, t: int(t[0][-1]+(t[0][:-1] if len(t[0]) > 1 else '1')))
-
- # Translate counts
- # TODO: regex should reject a bare '.' if we want to allow dots between formula parts
- fract = Regex("(0|[1-9][0-9]*|)([.][0-9]*)")
- fract.set_parse_action(lambda s, l, t: float(t[0]) if t[0] else 1)
- whole = Regex("(0|[1-9][0-9]*)")
- whole.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 1)
- number = Optional(~White()+(fract|whole), default=1)
- # TODO use unicode ₀₁₉ in the code below?
- sub_fract = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)")
- sub_fract.set_parse_action(lambda s, l, t: float(from_subscript(t[0])) if t[0] else 1)
- sub_whole = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*)")
- sub_whole.set_parse_action(lambda s, l, t: int(from_subscript(t[0])) if t[0] else 1)
- sub_count = Optional(~White()+(fract|whole|sub_fract|sub_whole), default=1)
-
- # Fasta code
- fasta = Regex("aa|rna|dna") + Literal(":").suppress() + Regex("[A-Z *-]+")
- def convert_fasta(string, location, tokens):
- #print("fasta", string, location, tokens)
- # TODO: fasta is ignoring table when parsing
- # TODO: avoid circular imports
- # TODO: support other biochemicals (carbohydrate residues, lipids)
- from . import fasta
- seq_type, seq = tokens
- if seq_type not in fasta.CODE_TABLES:
- raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
- seq = fasta.Sequence(name=None, sequence=seq, type=seq_type)
- return seq.labile_formula
- fasta.set_parse_action(convert_fasta)
-
- # Convert symbol, isotope, ion, count to (count, isotope)
- element = symbol+isotope+ion+sub_count
- def convert_element(string, location, tokens):
- """interpret string as element"""
- #print "convert_element received", tokens
- symbol, isotope, ion, count = tokens[0:4]
- if isotope != 0:
- symbol = symbol[isotope]
- if ion != 0:
- symbol = symbol.ion[ion]
- return (count, symbol)
- element.set_parse_action(convert_element)
-
- # Convert "count elements" to a pair
- implicit_group = number+OneOrMore(element)
- def convert_implicit(string, location, tokens):
- """convert count followed by fragment"""
- #print "implicit", tokens
- count = tokens[0]
- fragment = tokens[1:]
- return fragment if count == 1 else (count, fragment)
- implicit_group.set_parse_action(convert_implicit)
-
- # Convert "(composite) count" to a pair
- opengrp = space + Literal('(').suppress() + space
- closegrp = space + Literal(')').suppress() + space
- explicit_group = opengrp + composite + closegrp + sub_count
- def convert_explicit(string, location, tokens):
- """convert (fragment)count"""
- #print "explicit", tokens
- count = tokens[-1]
- fragment = tokens[:-1]
- return fragment if count == 1 else (count, fragment)
- explicit_group.set_parse_action(convert_explicit)
-
- # Build composite from a set of groups
- group = implicit_group | explicit_group
- implicit_separator = separator | space
- composite << group + ZeroOrMore(implicit_separator + group)
-
- density = Literal('@').suppress() + number + Optional(Regex("[ni]"), default='i')
- compound = (composite|fasta) + Optional(density, default=None)
- def convert_compound(string, location, tokens):
- """convert material @ density or fasta @ density"""
- # Messiness: both composite and density can be one or more tokens
- # If density is missing then it is None, otherwise it is count + [ni]
- # Compound can be a sequence of (count, fragment) pairs, or if it is
- # a fasta sequence it may already be a formula.
- material = tokens[:-1] if tokens[-1] is None else tokens[:-2]
- #print("compound", material, type(material[0]), len(material))
- if len(material) == 1 and isinstance(material[0], Formula):
- formula = material[0]
- else:
- #print("unbundling material", material)
- formula = Formula(structure=_immutable(material))
- density, form = (None, None) if tokens[-1] is None else tokens[-2:]
- #if density is None and formula.density is None:
- # # Estimate density from covalent radii and a 0.54 packing factor
- # mass = formula.molecular_mass
- # volume = formula.volume(packing_factor=0.54, H_radius=1.15)
- # density, form = mass/volume, 'n'
- # print(f"estimating density as {mass/volume=:.3f}")
- if form == 'n':
- formula.natural_density = density
- elif form == 'i':
- formula.density = density
- #print("compound", formula, f"{formula.density=:.3f}")
- return formula
- compound.set_parse_action(convert_compound)
-
- partsep = space + Literal('//').suppress() + space
- percent = Literal('%').suppress()
- weight = Regex("(w((eigh)?t)?|m(ass)?)").suppress()
- volume = Regex("v(ol(ume)?)?").suppress()
- weight_percent = (percent + weight) | (weight + percent) + space
- volume_percent = (percent + volume) | (volume + percent) + space
- mixture_by_weight = (number + weight_percent + mixture
- + ZeroOrMore(partsep+number+(weight_percent|percent)+mixture)
- + Optional(partsep + mixture, default=None))
- def _parts_by_weight_vol(tokens):
- #print("by weight or volume", tokens)
- if tokens[-1] is None:
- piece = tokens[1:-1:2]
- fract = [float(v) for v in tokens[:-1:2]]
- if abs(sum(fract) - 100) > 1e-12:
- raise ValueError(f"Formula percentages must sum to 100%, not {sum(fract)}")
- else:
- piece = tokens[1:-1:2] + [tokens[-1]]
- fract = [float(v) for v in tokens[:-1:2]]
- fract.append(100-sum(fract))
- if fract[-1] < 0:
- raise ValueError("Formula percentages must sum to less than 100%")
- #print piece, fract
- if len(piece) != len(fract):
- raise ValueError("Missing base component of mixture")
- return piece, fract
- def convert_by_weight(string, location, tokens):
- """convert mixture by wt% or mass%"""
- piece, fract = _parts_by_weight_vol(tokens)
- return _mix_by_weight_pairs(zip(piece, fract))
- mixture_by_weight.set_parse_action(convert_by_weight)
-
- mixture_by_volume = (number + volume_percent + mixture
- + ZeroOrMore(partsep+number+(volume_percent|percent)+mixture)
- + Optional(partsep + mixture, default=None))
- def convert_by_volume(string, location, tokens):
- """convert mixture by vol%"""
- piece, fract = _parts_by_weight_vol(tokens)
- return _mix_by_volume_pairs(zip(piece, fract))
- mixture_by_volume.set_parse_action(convert_by_volume)
-
- mixture_by_layer = Forward()
- layer_thick = Group(number + Regex(LENGTH_RE) + space)
- layer_part = (layer_thick + mixture) | (opengrp + mixture_by_layer + closegrp + sub_count)
- mixture_by_layer << layer_part + ZeroOrMore(partsep + layer_part)
- def convert_by_layer(string, location, tokens):
- """convert layer thickness '# nm material'"""
- if len(tokens) < 2:
- return tokens
- piece = []
- fract = []
- for p1, p2 in zip(tokens[0::2], tokens[1::2]):
- if isinstance(p1, Formula):
- f = p1.thickness * float(p2)
- p = p1
- else:
- f = float(p1[0]) * LENGTH_UNITS[p1[1]]
- p = p2
- piece.append(p)
- fract.append(f)
- total = sum(fract)
- vfract = [(v/total)*100 for v in fract]
- result = _mix_by_volume_pairs(zip(piece, vfract))
- result.thickness = total
- return result
- mixture_by_layer.set_parse_action(convert_by_layer)
-
- mixture_by_absmass = Forward()
- absmass_mass = Group(number + Regex(MASS_VOLUME_RE) + space)
- absmass_part = (absmass_mass + mixture) | (opengrp + mixture_by_absmass + closegrp + sub_count)
- mixture_by_absmass << absmass_part + ZeroOrMore(partsep + absmass_part)
- def convert_by_absmass(string, location, tokens):
- """convert mass '# mg material'"""
- if len(tokens) < 2:
- return tokens
- piece = []
- fract = []
- for p1, p2 in zip(tokens[0::2], tokens[1::2]):
- if isinstance(p1, Formula):
- p = p1
- f = p1.total_mass * float(p2)
- else:
- p = p2
- value = float(p1[0])
- if p1[1] in VOLUME_UNITS:
- # convert to volume in liters to mass in grams before mixing
- if p.density is None:
- raise ValueError("Need the mass density of "+str(p))
- f = value * VOLUME_UNITS[p1[1]] * 1000.*p.density
- else:
- f = value * MASS_UNITS[p1[1]]
- piece.append(p)
- fract.append(f)
-
- total = sum(fract)
- mfract = [(m/total)*100 for m in fract]
- result = _mix_by_weight_pairs(zip(piece, mfract))
- result.total_mass = total
- return result
- mixture_by_absmass.set_parse_action(convert_by_absmass)
-
- ungrouped_mixture = (mixture_by_weight | mixture_by_volume
- | mixture_by_layer | mixture_by_absmass)
- grouped_mixture = opengrp + ungrouped_mixture + closegrp + Optional(density, default=None)
- def convert_mixture(string, location, tokens):
- """convert (mixture) @ density"""
- formula = tokens[0]
- if tokens[-1] == 'n':
- formula.natural_density = tokens[-2]
- elif tokens[-1] == 'i':
- formula.density = tokens[-2]
- # elif tokens[-1] is None
- return formula
- grouped_mixture.set_parse_action(convert_mixture)
-
- mixture << (compound | grouped_mixture)
- formula = (compound | ungrouped_mixture | grouped_mixture)
- grammar = Optional(formula, default=Formula()) + StringEnd()
-
- grammar.set_name('Chemical Formula')
- return grammar
-
-_PARSER_CACHE: dict[PeriodicTable, ParserElement] = {}
-def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
- """
- Parse a chemical formula, returning a structure with elements from the
- given periodic table.
- """
- table = default_table(table)
- if table not in _PARSER_CACHE:
- _PARSER_CACHE[table] = formula_grammar(table)
- parser = _PARSER_CACHE[table]
- #print(parser)
- return parser.parse_string(formula_str)[0]
-
def _count_atoms(seq: Structure) -> dict[Atom, float]:
"""
Traverse formula structure, counting the total number of atoms.
@@ -1073,97 +797,45 @@ def _convert_to_hill_notation(atoms: dict[Atom, float]) -> Structure:
"""
return tuple((atoms[el], el) for el in sorted(atoms.keys(), key=_hill_key))
-def _str_one_atom(fragment: Atom) -> str:
- # Normal isotope string form is #-Yy, but we want Yy[#]
- if isisotope(fragment) and 'symbol' not in fragment.__dict__:
- ret = "%s[%d]"%(fragment.symbol, cast(Isotope, fragment).isotope)
- else:
- ret = fragment.symbol
- if fragment.charge != 0:
- sign = '+' if fragment.charge > 0 else '-'
- value = str(abs(fragment.charge)) if abs(fragment.charge) > 1 else ''
- ret += '{'+value+sign+'}'
- return ret
+def _str_one_atom(atom: Atom) -> str:
+ """
+ Format a single atom as SYMBOL[ISOTOPE]{VALENCE}.
-# TODO: add typing to _str_atoms
-def _str_atoms(seq) -> str:
+ Can't use str(atom) => ISOTOPE-SYMBOL{VALENCE} or repr(atom) => SYMBOL[ISOTOPE].ion[VALENCE]
+ """
+ valence = isotope = ""
+ if ision(atom):
+ ion = cast(Ion, atom)
+ charge = '-' if ion.charge < 0 else '+'
+ magnitude = abs(ion.charge)
+ valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}"
+ valence = "{%s}"%valence
+ atom = ion.element
+ if isisotope(atom):
+ iso = cast(Isotope, atom)
+ if iso.symbol == iso.element.symbol:
+ isotope = f"[{iso.isotope}]"
+ return f"{atom.symbol}{isotope}{valence}"
+
+def _str_atoms(seq) -> list[str]:
"""
Convert formula structure to string.
"""
#print "str", seq
- ret = ""
+ ret = []
for count, fragment in seq:
if isatom(fragment):
- ret += _str_one_atom(fragment)
+ ret.append(_str_one_atom(fragment))
if count != 1:
- ret += "%g"%count
+ ret.append(f"{count:g}")
+ elif count == 1:
+ ret.extend(_str_atoms(fragment))
else:
- if count == 1:
- piece = _str_atoms(fragment)
- else:
- piece = "(%s)%g"%(_str_atoms(fragment), count)
- #ret = ret+" "+piece if ret else piece
- ret += piece
+ ret.extend(("(", *_str_atoms(fragment), ")", f"{count:g}"))
return ret
-def from_subscript(value: str) -> str:
- """
- Convert unicode subscript characters to normal characters. This allows us to parse,
- for example, H₂O as H2O.
- """
- subscript_codepoints = {
- '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
- '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7',
- '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-',
- '\u208c': '=', '\u208d': '(', '\u208e': ')',
-
- '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x',
- '\u2095': 'h', '\u2096': 'k', '\u2097': 'l',
- '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's',
- '\u209c': 't',
- }
- return ''.join(subscript_codepoints.get(char, char) for char in str(value))
-
-def unicode_subscript(value: str) -> str:
- # Unicode subscript codepoints. Note that decimal point looks okay as subscript
- subscript_codepoints = {
- '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083',
- '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087',
- '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b',
- '=': '\u208c', '(': '\u208d', ')': '\u208e',
-
- 'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093',
- 'h': '\u2095', 'k': '\u2096', 'l': '\u2097',
- 'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b',
- 't': '\u209c',
-
- '\u2013': '\u208b', # en-dash is same as dash
- '\u2014': '\u208b', # em-dash is same as dash
- }
- return ''.join(subscript_codepoints.get(char, char) for char in str(value))
-
-def unicode_superscript(value: str) -> str:
- # Unicode subscript codepoints. Note that decimal point looks okay as subscript
- superscript_codepoints = {
- #'.': '\u00B0', # degree symbol looks too much like zero
- #'.': ' \u02D9', # dot above modifier looks okay in a floating string, but risky
- #'.': ' \u0307', # space with dot above?
- #'.': '\u22C5', # math dot operator
- '.': '\u1427', # Canadian aboriginal extended block dot (looks good on mac)
- '2': '\u00B2', '3': '\u00B3',
- '1': '\u00B9',
- '0': '\u2070', 'i': '\u2071',
- '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077',
- '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b',
- '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f',
-
- '\u2013': '\u207b', # en-dash is same as dash
- '\u2014': '\u207b', # em-dash is same as dash
- }
- return ''.join(superscript_codepoints.get(char, char) for char in str(value))
-
SUBSCRIPT: dict[str, Callable[[str], str]] = {
# The latex renderer should work for github style markdown
'latex': lambda text: f'$_{{{text}}}$',
@@ -1171,32 +843,80 @@ def unicode_superscript(value: str) -> str:
'unicode': unicode_subscript,
'plain': lambda text: text
}
-def pretty(compound: Formula, mode: str='unicode') -> str:
+SUPERSCRIPT: dict[str, Callable[[str], str]] = {
+ # Superscript renderers keyed by mode; the latex form should also work for github style markdown
+ 'latex': lambda text: f'$^{{{text}}}$',
+ 'html': lambda text: f'<sup>{text}</sup>',  # html needs explicit <sup> markup to raise the text
+ 'unicode': unicode_superscript,
+ 'plain': lambda text: text,
+}
+
+class PrettyFormula:
"""
- Convert the formula to a string. The *mode* can be 'unicode', 'html' or
- 'latex' depending on how subscripts should be rendered. If *mode* is 'plain'
- then don't use subscripts for the element quantities.
+ Formula pretty-printer.
- Use *pretty(compound.hill)* for a more compact representation.
+ Formats formulas for output, using superscripts for isotope and valence and
+ subscripts for element counts.
+
+ *mode* is unicode, latex, html or plain for no special formatting.
"""
- return _pretty(compound.structure, SUBSCRIPT[mode])
-
-# TODO: type hinting for _pretty
-def _pretty(structure, subscript: Callable[[str], str]) -> str:
- # TODO: if superscript is not None then render O[16] as {}^{16}O
- parts = []
- for count, part in structure:
- if isinstance(part, tuple):
- if count == 1:
- parts.append(_pretty(part, subscript))
- else:
- parts.append(f'({_pretty(part, subscript)}){subscript(count)}')
- elif count == 1:
- parts.append(f'{_str_one_atom(part)}')
+ mode: str
+ superscript: Callable[[str], str]
+ subscript: Callable[[str], str]
+
+ def __init__(self, mode):
+ self.mode = mode
+ self.subscript = SUBSCRIPT[mode]
+ self.superscript = SUPERSCRIPT[mode]
+
+ def walk_atom(self, atom):
+ if ision(atom):
+ charge = '-' if atom.charge < 0 else '+'
+ magnitude = abs(atom.charge)
+ valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}"
+ valence = self.superscript(valence)
+ atom = atom.element
+ else:
+ valence = ""
+ if isisotope(atom) and atom.symbol == atom.element.symbol:
+ isotope = self.superscript(str(atom.isotope))
else:
- parts.append(f'{_str_one_atom(part)}{subscript(count)}')
- return ''.join(parts)
+ isotope = ""
+ return f"{isotope}{atom.symbol}{valence}"
+
+ def format(self, compound: Formula):
+ if self.mode == 'plain':
+ return str(compound)
+ return self.walk(compound.structure)
+
+ def walk(self, structure):
+ parts = []
+ for count, part in structure:
+ if isinstance(part, tuple):
+ if count == 1:
+ parts.append(self.walk(part))
+ else:
+ parts.append(f'({self.walk(part)}){self.subscript(count)}')
+ elif count == 1:
+ parts.append(self.walk_atom(part))
+ else:
+ parts.append(f'{self.walk_atom(part)}{self.subscript(count)}')
+ return ''.join(parts)
+
+def pretty(compound: Formula, mode: str='unicode') -> str:
+ """
+ Convert the formula to a string.
+
+ *mode* is 'unicode', 'html', 'latex' or 'plain'; None selects 'unicode'.
+
+ If *mode* is 'plain' then don't use superscripts and subscripts for rendering.
+
+ Use *pretty(compound.hill)* for a more compact representation.
+ """
+ if mode is None:
+ mode = 'unicode'
+ return PrettyFormula(mode).format(compound)
def demo():
import sys
diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
new file mode 100644
index 0000000..f19efd0
--- /dev/null
+++ b/periodictable/lark_parse.py
@@ -0,0 +1,902 @@
+from typing import cast
+
+import lark
+
+from .core import PeriodicTable, Element, Atom, Isotope
+from .core import default_table
+from .formulas import (
+ Formula, Structure,
+ _mix_by_weight_pairs, _mix_by_volume_pairs,
+ pretty as pretty_formula
+)
+from .util import from_subscript, from_superscript
+
+# TODO: valence belongs to a group rather than element
+
+# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix
+# Scale factors to metres, grams and litres respectively. Every unit string accepted
+# by the LENGTH, MASS and VOLUME terminals in the grammar needs an entry here,
+# including the 'μ' spellings, otherwise the mixture transforms fail with KeyError.
+LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10}
+MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'μg': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3}
+VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'μL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
+
+# TODO: use grammar string directly in the sphinx/guide/formula_grammar.rst
+# Any changes to the grammar below should be copied to formula_grammar.rst
+grammar = """
+start : SPACE? formula SPACE? # strip blank space from start and end
+
+# formula: composite @ density | str:sequence @ density | mixture
+formula : compound | mixture
+compound : (composite | fasta) [density]
+# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
+# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
+
+# Activation only cares about total mass, so you can freely mix masses and volumes if
+# you have the density for each component. For scattering you need the density of the
+# mixture. When this is different from the mixture of densities use (mixture)@density.
+# For thin film samples, allow stacking of layers with the thickness of each layer.
+# With density for each layer the relative quantities of each element in the stack can
+# be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²).
+
+# mixture: quantity compound // quantity compound // quantity compound
+mixture : byamount | byvolume | byweight | layers
+byamount : quantity compound (MIX quantity compound)*
+byvolume : volumepct compound (MIX percentage compound)* MIX compound
+byweight : weightpct compound (MIX percentage compound)* MIX compound
+layers : thickness compound (MIX thickness compound)*
+quantity : NUMBER SPACE? (MASS | VOLUME) SPACE
+weightpct : NUMBER SPACE? WEIGHTPCT SPACE
+volumepct : NUMBER SPACE? VOLUMEPCT SPACE
+thickness : NUMBER SPACE? LENGTH SPACE
+percentage : NUMBER SPACE? "%" SPACE # Allows "3 % "
+MIX : SPACE? "//" SPACE?
+WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+VOLUME : "L" | "mL" | "uL" | "μL" | "nL"
+LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
+
+# FASTA sequence: (rna|dna|aa):SEQUENCE @ density
+fasta : FASTA ":" SEQUENCE
+FASTA : /[a-z]+/ # str:sequence reports better errors than /dna|rna|aa/:sequence
+SEQUENCE : /[-A-Z *]+/
+
+# composite: number group number group ... @density
+# group: El count El count ...
+# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?`
+composite : [NUMBER] group (SEPARATOR [NUMBER] group)*
+group : ((atom | isoatom | "(" formula ")") [COUNT])+
+atom : SYMBOL [isotope] [valence]
+isoatom : SUPERINT SYMBOL [valence] # For example ²H for deuterium
+isotope : "[" INTEGER "]"
+valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
+density : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
+# could list all elements, but better error reporting if element symbol lookup fails
+SYMBOL : /[A-Z][a-z]*/
+CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--}
+SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻
+DENSITY : NUMBER # using alias DENSITY for number for better error reporting
+DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density
+COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts
+SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE # For example, CaCO₃·6H₂O
+
+SPACE : /[ \\t\\n\\r]+/
+NUMBER : INTEGER | FRACTION
+INTEGER : /[1-9][0-9]*/
+FRACTION : /([1-9][0-9]*|0)?[.][0-9]*/ # allow all floats?
+SUBNUM : SUBINT | SUBFRAC
+SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
+SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
+SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
+"""
+
+# propagate_positions saves start_pos and end_pos for each rule as well as each terminal.
+formula_parser = lark.Lark(grammar, propagate_positions=True)
+
+def int_or_float(s):
+    """
+    Convert the numeric string *s* to an int when its value is integral,
+    otherwise to a float. For example "3" and "3.0" give 3 while "3.5" gives 3.5.
+    """
+    as_float = float(s)
+    as_int = int(as_float)
+    if as_int == as_float:
+        return as_int
+    return as_float
+
+class StripPunctuation(lark.Transformer):
+    """
+    Token stripper visitor class.
+
+    This is done separately from the formula composer so that we can show the cleaned tree
+    before debugging the conversion.
+
+    Each method is named for a terminal in the grammar and returns lark.Discard for it.
+    The stripped terminals are SEPARATOR, MIX, SPACE, WEIGHTPCT and VOLUMEPCT.
+
+    Unnamed punctuation characters []{}():%@ which are represented as quoted strings
+    within the grammar rules have no associated token. The unit terminals MASS, VOLUME
+    and LENGTH are not stripped here; ConvertTokens turns them into (dimension, unit) pairs.
+
+    Note: could get the same effect by renaming the unused terminals with leading underscore,
+    but that makes the grammar harder to read.
+    """
+    def SEPARATOR(self, _):
+        """Strip token for molecular fragment separator (+ or center dot or spaces)."""
+        return lark.Discard
+    def MIX(self, _):
+        """Strip token for mixture separator //."""
+        return lark.Discard
+    def SPACE(self, _):
+        """Strip token for (usually optional) spaces."""
+        return lark.Discard
+    def WEIGHTPCT(self, _):
+        """Strip token for wt% mixture indicator."""
+        return lark.Discard
+    def VOLUMEPCT(self, _):
+        """Strip token for vol% mixture indicator."""
+        return lark.Discard
+
+class ConvertTokens(lark.Transformer):
+ """
+ Syntax tree to formula conversion class.
+ """
+ def __init__(self, text, table=None):
+ """
+ *text* is the original formula string.
+
+ *table* is an optional alternative periodic table.
+ """
+ self._context = text
+ self._table = default_table(table)
+
+ def VOLUME(self, token: lark.Token) -> tuple[str, str]:
+     """
+     Convert VOLUME terminal to ('volume', unit) pair.
+
+     Unit is a volume unit, such as mL or uL for microlitres.
+     """
+     return 'volume', token.value
+ def MASS(self, token: lark.Token) -> tuple[str, str]:
+ """
+ Convert MASS terminal to ('mass', unit) pair.
+
+ Unit is a mass unit, such as g or mg.
+ """
+ return 'mass', token.value
+ def LENGTH(self, token: lark.Token) -> tuple[str, str]:
+ """
+ Convert LENGTH terminal to ('length', unit) pair.
+
+ Unit is a length unit, such as cm or nm.
+ """
+ return 'length', token.value
+ def NUMBER(self, token: lark.Token) -> int|float:
+ """
+ Convert string to float or integer.
+
+ Numbers are used for quantities and percentages in mixtures, and for multiplier
+ counts to molecule fragments.
+ """
+ return int_or_float(token.value)
+ DENSITY = NUMBER # We've aliased DENSITY and NUMBER in the grammar
+ def INTEGER(self, token: lark.Token) -> int:
+     """
+     Convert string to integer.
+     """
+     return int(token.value)
+ def COUNT(self, token: lark.Token) -> int|float:
+     """
+     Return the count value for a group component.
+
+     Count is specified after the symbol, either as an ASCII number or using subscript digits.
+     The period separator for fractional counts uses ASCII in both cases (there is no subscript
+     period character available). If the count is fractional return it as a float, otherwise
+     return it as an integer.
+     """
+     return int_or_float(from_subscript(token.value))
+ def SUPERINT(self, token) -> int:
+ """
+ Return the integer value of a sequence of superscript digits.
+
+ This is used to specify the valence or to specify the isotope.
+ """
+ return int(from_superscript(token.value))
+ def DENSITYMODE(self, token) -> str:
+ """
+ Return the value of the DENSITYMODE token, either "n" or "i". If no mode is specified
+ then a token value of None will be given to the density rule.
+ """
+ return token.value
+ def CHARGE(self, token) -> str:
+     """
+     Return the charge string, a sequence of plus or minus characters. By grammar
+     rules they must all have the same sign.
+
+     This is used in the valence rule to specify the charge for the atom. The
+     string is returned as is; the valence rule turns it into a signed number.
+     """
+     return token.value
+ def SUPERCHARGE(self, token) -> str:
+ """
+ Convert sequence of superscript plus and minus characters to ASCII plus and minus.
+
+ This is used in the valence rule to specify the charge for the atom.
+ """
+ return from_superscript(token.value)
+ def SYMBOL(self, token) -> Element:
+ """
+ Look up the element in the periodic table and return it.
+
+ Raise ValueError if the element doesn't exist.
+ """
+ try:
+ return self._table.symbol(token.value)
+ except Exception:
+ raise ValueError(f"Element {token.value} doesn't exist")
+ def FASTA(self, token) -> str:
+ """
+ Return the token value as the fasta sequence type: "dna", "rna" or "aa".
+ """
+ return token.value
+ def SEQUENCE(self, token) -> str:
+ """
+ Return the token value as the fasta sequence string.
+ """
+ return token.value
+ def isotope(self, tokens) -> int:
+ """
+ Return the isotope number for the atom.
+
+ Transform: [isotope] => isotope
+ """
+ return tokens[0]
+ def valence(self, tokens) -> int:
+     """
+     Return valence from number and sign.
+
+     Valence is either a number followed by plus or minus, or a sequence of plus
+     or minus. If the number was specified it will already have been converted
+     to a value, otherwise use the length of the charge string as the value.
+
+     The valence can be given using superscript or regular ASCII number and sign
+     symbols. If ASCII then they need to be wrapped in braces such as Ca{2+}. The
+     token transform handles the conversion from superscript to ASCII characters
+     and the conversion from string to number.
+
+     Raise ValueError if a number was supplied along with multiple charge symbols.
+     This includes a superscript zero, which the SUPERINT terminal allows.
+
+     Transform: [number|None, 'charge'] => valence
+
+     Example: Ca{1+} => [1, '+'] = Ca.ion[1]
+
+     Example: Ca{++} => [None, '++'] = Ca.ion[2]
+
+     Example: Ca{3--} => ValueError
+     """
+     value, charge = tokens
+     if value is None:
+         # No explicit number, so count the signs: '++' means 2+
+         value = len(charge)
+     elif len(charge) > 1:
+         # Test the sign count alone so that a zero value is rejected as well.
+         raise ValueError(f"Use {value}{charge[0]} instead of {value}{charge} for valence")
+     return value if charge[0] == '+' else -value
+ def atom(self, tokens) -> Atom:
+ """
+ Returns an atom from the periodic table.
+
+ Usually this will use elements from the default table, but if an alternate table is
+ provided to the ConvertTokens constructor then that will be used to retrieve the element
+ from the symbol.
+
+ Raises an error if the symbol does not exist, does not have that isotope or doesn't
+ allow that valence.
+
+ Transform: ['symbol', isotope|None, valence|None] => atom
+
+ Example: ['H', 1, 1] => H[1]{+}
+
+ Example: ['Ca', None, 2] => Ca{2+}
+ """
+ #print("atom", tokens)
+ el, iso, ion = tokens
+ if iso and ion:
+ atom = el[iso].ion[ion]
+ elif iso:
+ atom = el[iso]
+ elif ion:
+ atom = el.ion[ion]
+ else:
+ atom = el
+ #print(f"atom {tokens} => {atom}")
+ return atom
+
+ def isoatom(self, tokens) -> Atom:
+ """
+ Returns an isotope from the periodic table.
+
+ Usually this will use elements from the default table, but if an alternate table is
+ provided to the ConvertTokens constructor then that will be used to retrieve the element
+ from the symbol.
+
+ Raises an error if the symbol does not exist, does not have that isotope or doesn't
+ allow that valence.
+
+ Transform: [isotope, 'symbol', valence|None] => atom
+
+ Example ²H⁺: [2, 'H', 1] => D{+}
+ """
+ # print("isoatom", tokens)
+ iso, el, ion = tokens
+ atom = el[iso].ion[ion] if ion else el[iso]
+ # print(f"isoatom {tokens} => {atom}")
+ return atom
+
+
+ def group(self, tokens) -> Structure:
+ """
+ Returns a sequence of (count, item) pairs, where item is an atom or a nested formula.
+ Missing counts default to 1.
+
+ Transform: [atom|formula, count|None, ...] => ((count, atom|formula), ...)
+
+ Example CaCO3: [Ca, None, C, None, O, 3]
+ => ((1, Ca), (1, C), (3, O))
+ """
+ # print("group tokens", tokens)
+ tokens = [1 if value is None else value for value in tokens]
+ pairs = tuple((count, item) for item, count in zip(tokens[::2], tokens[1::2]))
+ # print("group output", pairs)
+ return pairs
+
+ def composite(self, tokens) -> Structure:
+ """
+ Returns a sequence of (number, group) pairs. Each group is a sequence of (count, item)
+ pairs, where item is an atom or a nested formula. Missing numbers default to 1.
+
+ Transform: [number|None, group, ...] => ((number, group), ...) | ((count, atom), ...)
+
+ Example CaCO3 6H2O: [None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))]
+ => ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O))))
+
+ Example CaCO3(H2O)6: [None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))]
+ => ((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),)
+
+ Example CaCO3 (H2O)6: [None, ((1, Ca), (1, C), (3, O)), None, ((6, formula('H2O')),)]
+ => ((1, ((1, Ca), (1, C), (3, O))), (1, ((6, formula('H2O')),)))
+ """
+ # print("composite tokens", tokens)
+ numbers = [1 if v is None else v for v in tokens[::2]]
+ groups = tokens[1::2]
+ pairs = tuple((number, group) for number, group in zip(numbers, groups))
+ # print("composite output", pairs)
+ return pairs
+
+ def fasta(self, tokens) -> Structure:
+     r"""
+     Returns the formula corresponding to the FASTA sequence, with the natural
+     density set. Labile hydrogens use H[1] in the formula.
+
+     The extra level of nesting in the return value is so that the fasta structure
+     is like a composite with a single group containing a nested formula.
+
+     Raises ValueError if the sequence type is not a key of CODE_TABLES.
+
+     Transform: [ 'aa|dna|rna', '[-A-Z \*]+' ] => ((1, ((1, formula),)),)
+
+     Example: dna:CAGT: ['dna', 'CAGT'] => ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),)
+     """
+     # TODO: fasta is ignoring table when parsing
+     # TODO: avoid circular imports
+     # TODO: support other biochemicals (carbohydrate residues, lipids)
+     from periodictable.fasta import CODE_TABLES, Sequence
+
+     # print("fasta input", tokens)
+     seq_type, seq = tokens
+     if seq_type not in CODE_TABLES:
+         raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
+     seq = Sequence(name="seq", sequence=seq, type=seq_type)
+     pairs = ((1, seq.labile_formula),)
+     composite = ((1, pairs),)
+     # print("fasta output", composite)
+     # return tuple[tuple[int, tuple[tuple[int, Formula]]]] as Structure
+     return cast(Structure, composite)
+
+ def density(self, tokens) -> tuple[str, float, str]:
+     """
+     Returns a density tuple from the @density construct. Density mode 'n' for
+     natural or 'i' for isotopic defaults to isotopic. That is, D2O@1.11 is the
+     isotopic density of D2O, not the natural density of H2O with conversion to
+     the heavier deuterium isotope.
+
+     Transform: [value, mode|None] => ('density', value, mode)
+
+     Example @1.11: [1.11, None] => ('density', 1.11, 'i')
+
+     Example @1.11i: [1.11, 'i'] => ('density', 1.11, 'i')
+
+     Example @1n: [1, 'n'] => ('density', 1, 'n')
+     """
+     value = tokens[0]
+     mode = 'i' if not tokens[1] else tokens[1]
+     return 'density', value, mode
+
+ def compound(self, tokens) -> Formula:
+ """
+ Returns the formula for the compound, with optional density set.
+
+ Density is ('density', value, mode) or None, where mode is 'i' for isotopic density
+ or 'n' for natural density.
+
+ The compound may come from a FASTA spec, such as dna:CAGT or from a composite, such
+ as CaCO3+6H2O. The composite may include an embedded formula, such as CaCO3(H2O)6.
+ In any case, the resulting material token will be a sequence of (multiplier, group)
+ pairs, where each group is a sequence of (count, item) pairs. Each item may be an
+ atom or a formula. The fasta transform returns a single group with a single item.
+ As a nested sequence this is ((1, ((1, formula), ...)), ...), with nothing in the
+ ellipses.
+
+ Transform: [((number, group), ...), ('density', value, mode)|None] => formula
+
+ Example NaCl@2.16i: [((1, ((1, Na), (1, Cl))),), ('density', 2.16, 'i')] => NaCl@2.16i
+
+ Example dna:CAGT: [((1, ((1, formula('C39H37H[1]10N15O25P4')),)),), None] => C39H37H[1]10N15O25P4@1.69n
+
+ Example CaCO3 6H2O: [((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))), None] => CaCO3(H2O)6
+
+ Example CaCO3(H2O)6: [((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),), None] => CaCO3(H2O)6
+ """
+ # print("compound tokens", tokens)
+ components, density_tuple = tokens
+ if density_tuple is None:
+ density, density_mode = None, 'i'
+ else:
+ _, density, density_mode = density_tuple
+
+ # If a singleton formula with no density override then return it
+ # That is, [(1, ((1, formula),)), None] => formula
+ if density is None and len(components) == 1:
+ number, group = components[0]
+ if len(group) == 1 and number == 1:
+ count, item = group[0]
+ if count == 1 and isinstance(item, Formula):
+ # print("isolated formula with no density override")
+ return item
+
+ # Not an isolated formula, so expand formulas within the groups.
+ # That is, [..., (number, (..., (count, formula), ...)), ...]
+ # becomes [..., (number, (..., (count, formula.structure), ...)), ...]
+ def expand_formula(group):
+ return tuple((count, getattr(item, 'structure', item)) for count, item in group)
+ components = tuple((number, expand_formula(group)) for number, group in components)
+
+ # If it is a singleton group then use its structure as the formula structure.
+ if len(components) == 1 and components[0][0] == 1:
+ structure = components[0][1]
+ else:
+ structure = components
+
+ # Build the formula and assign density if available.
+ # print("compound structure", structure)
+ formula = Formula(structure=structure)
+ if density is not None:
+ if density_mode == 'n':
+ formula.natural_density = density
+ else:
+ formula.density = density
+
+ # print(f"compound output {formula} @ {formula.density}")
+ return formula
+
+ def weightpct(self, tokens) -> float:
+     """
+     Returns the percentage. The value has already been converted to a number.
+
+     Used as the first percentage of a mix by weight mixture.
+
+     Transform: [percent] => percent
+
+     Example for "3 wt%": [3] => 3
+     """
+     return tokens[0]
+
+ def volumepct(self, tokens) -> float:
+     """
+     Returns the percentage. The value has already been converted to a number.
+
+     Used as the first percentage of a mix by volume mixture.
+
+     Transform: [percent] => percent
+
+     Example for "3 vol%": [3] => 3
+     """
+     return tokens[0]
+
+ def percentage(self, tokens) -> float:
+     """
+     Returns the percentage. The value has already been converted to a number.
+
+     Transform: [percent] => percent
+
+     Example for " 3 % ": [3] => 3
+     """
+     return tokens[0]
+
+ def byweight(self, tokens) -> Formula:
+ """
+ Returns mixture by wt% of the various components in the system.
+
+ Raises ValueError if total exceeds 100%.
+
+ Transform: [percent, formula, ..., percent, formula, formula] => formula
+
+ Example: [76.95, D2O, H2O] => (D2O)3H2O
+ """
+ # TODO: structure not preserved in mixtures
+ total = sum(tokens[:-1:2])
+ if total > 100:
+ raise ValueError(f"Total weight {total}% is more than 100% in wt% mixture")
+ pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
+ pairs.append((tokens[-1], 100-total))
+ # return 'byweight', [*pairs, last_pair]
+ formula = _mix_by_weight_pairs(pairs)
+ # print(f"byweight => {formula} @ {formula.density}")
+ return formula
+
+ def byvolume(self, tokens) -> Formula:
+ """
+ Returns mixture by vol% of the various components in the system. Volumes are converted
+ to mass using density.
+
+ Raises ValueError if the density is missing from a component formula.
+ Raises ValueError if total exceeds 100%.
+
+ Transform: [percent, formula, ..., percent, formula, formula] => formula
+
+ Example: [75.0, D2O@1n, H2O@1n] => (D2O)3H2O
+ """
+ # print("by volume", tokens)
+ total = sum(tokens[:-1:2])
+ if total > 100:
+ raise ValueError(f"Total volume {total}% is more than 100% in vol% mixture")
+ pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
+ pairs.append((tokens[-1], 100-total))
+ # print("byvolume pairs", pairs)
+ # print("byvolume density", [f.density for f, p in pairs])
+ #return 'byvolume', pairs
+ formula = _mix_by_volume_pairs(pairs)
+ return formula
+
+ def byamount(self, tokens) -> Formula:
+     """
+     Returns mixture by mass of the various components in the system. Volumes are converted
+     to mass using density.
+
+     Sets formula.total_mass to the sum of the component masses.
+
+     Raises ValueError if the density is missing from a component given by volume.
+     Raises ValueError if the total mass is zero, since mass fractions are then undefined.
+
+     Transform: [quantity, formula, ...] => formula
+
+     Example: [('mass', 5.07, 'g'), NaCl@2.16, ('volume', 50, 'mL'), H2O@1n] => NaCl(H2O)32
+     """
+     def find_value(quantity, formula):
+         # Mass of one component in grams.
+         qtype, value, units = quantity
+         if qtype == 'volume':
+             if formula.density is None:
+                 raise ValueError(f"Need the mass density of {formula}")
+             # VOLUME_UNITS scales to litres; 1000 converts litres to cm³ for g/cm³ density
+             mass = value * VOLUME_UNITS[units] * 1000.0 * formula.density
+         else:
+             mass = value * MASS_UNITS[units]
+         return mass
+     compounds = tokens[1::2]
+     values = [find_value(q, f) for q, f in zip(tokens[::2], compounds)]
+     total = sum(values)
+     if total <= 0:
+         # Quantities such as "0. g" parse as numbers; report them rather than divide by zero
+         raise ValueError("Total mass of the mixture must be greater than zero")
+     percent = [(m/total)*100 for m in values]
+     # Pass a list, as byweight does, rather than a one-shot zip iterator
+     formula = _mix_by_weight_pairs(list(zip(compounds, percent)))
+     formula.total_mass = total
+     return formula
+
+ def layers(self, tokens) -> Formula:
+     """
+     Returns the mixture by volume of the various layers in the system.
+
+     Raises ValueError if the density is missing from a component formula.
+     Raises ValueError if the total thickness is zero, since volume fractions are
+     then undefined.
+
+     Sets formula.thickness to the sum of the layer thicknesses.
+
+     Transform: [quantity, formula, ...] => formula
+
+     Example: [('length', 10.006, 'nm'), Ni, ('length', 3, 'mm'), Si] => NiSi164000
+     """
+     # The grammar only accepts LENGTH units in a layer stack, so every unit
+     # is expected to be a key of LENGTH_UNITS.
+     values = [value*LENGTH_UNITS[units] for dim, value, units in tokens[::2]]
+     total = sum(values)
+     if total <= 0:
+         # Thicknesses such as "0. nm" parse as numbers; report them rather than divide by zero
+         raise ValueError("Total thickness of the layers must be greater than zero")
+     percent = [(m/total)*100 for m in values]
+     # Pass a list, as byvolume does, rather than a one-shot zip iterator
+     formula = _mix_by_volume_pairs(list(zip(tokens[1::2], percent)))
+     formula.thickness = 100*total # convert meters to centimeters for cgs units
+     return formula
+
+ def mixture(self, tokens) -> Formula:
+ """
+ Returns the formula representing the mixture, either byweight, byvolume, byamount or layers
+
+ Transform: [formula] => formula
+ """
+ return tokens[0]
+
+ def formula(self, tokens) -> Formula:
+ """
+ Return the formula representing the compound or mixture.
+
+ Transform: [formula] => formula
+ """
+ return tokens[0]
+
+ def thickness(self, tokens) -> tuple[str, float, str]:
+ """
+ Returns (dimension, value, unit) with dimension equal 'length'
+
+ Transform: [value, ('length', unit)] => ('length', value, unit)
+
+ Example: [5, ('length', 'nm')] => ('length', 5, 'nm')
+ """
+ value, (dim, units) = tokens
+ return dim, value, units
+
+ def quantity(self, tokens) -> tuple[str, float, str]:
+ """
+ Returns (dimension, value, unit) with dimension equal 'mass' or 'volume'
+
+ Transform: [value, (dimension, unit)] => (dimension, value, unit)
+
+ Example: [5, ('mass', 'g')] => ('mass', 5, 'g')
+ """
+ value, (dim, units) = tokens
+ return dim, value, units
+
+ def start(self, tokens) -> Formula:
+ """
+ Return the final formula, with the original text attached.
+
+ Sets formula.name to the parser input string before returning.
+
+ Transform: [formula] => formula
+ """
+ formula = tokens[0]
+ # Remember the string which was parsed
+ formula.name = self._context
+ return formula
+
+# TODO: if the next character is ":" then report error as bad fasta sequence type
+def _allowed(allowed):
+    """
+    Build the readable "expected ..." fragment of a parse error message from the
+    terminal names in *allowed*.
+
+    Names in the skip set are dropped unless nothing else remains, the rest are
+    replaced by their display form, and duplicates are merged. Returns a single
+    name, "one of ..." followed by the sorted names, or "end of formula" when
+    *allowed* is empty.
+    """
+    # * SPACE, SEPARATOR: Generally ignored
+    # * LPAR occurs wherever a symbol could be expected, so skip it
+    # * LSQB: end of element, looking for isotope, so skip
+    # * LBRACE, SUPERINT, SUPERCHARGE: end of element, looking for valence, so skip
+    skip = {"SPACE", "SEPARATOR", "LPAR", "LSQB", "LBRACE", "SUPERINT", "SUPERCHARGE"}
+    # TODO: use order of elements in subst to sort the allowed list (currently alphabetical)
+    subst = {
+        "NUMBER": "NUMBER", # start of compound or start of mixture
+        "FASTA": "aa:SEQ",
+        "COLON": ":SEQ",
+        "SEQUENCE": "aa:SEQ",
+        "SEPARATOR": "+", # generic group separator in composite
+        "SPACE": "SPACE",
+        "SYMBOL": "SYMBOL",
+        "CHARGE": "CHARGE[+-]",
+        "LPAR": '(',
+        "RPAR": ')',
+        "LSQB": '[',
+        "RSQB": ']',
+        "LBRACE": '{', # equivalent to SUPERINT and SUPERCHARGE
+        "RBRACE": '}',
+        "VOLUMEPCT": "vol%",
+        "WEIGHTPCT": "wt%",
+        "MASS": "UNIT[mg]",
+        "VOLUME": "UNIT[mL]",
+        "LENGTH": "UNIT[mm]",
+        "PERCENT": "%",
+        # The three density terminals share one display string; the set built
+        # below merges them if more than one is allowed at once.
+        "AT": "@DENSITY[ni]", # only the @ is expected, but better for doc
+        "DENSITY": "@DENSITY[ni]", # only the number is expected, but better for doc
+        "DENSITYMODE": "@DENSITY[ni]", # only the [ni] is expected, but better for doc
+        "MIX": "//",
+        # SUBNUM SUBINT SUBFRAC covered by COUNT
+        # INTEGER and FRACTION covered by NUMBER
+        # SUPERINT SUPERCHARGE LSQB LBRACE coexist with COUNT so stripped
+        "SUPERCHARGE": "SUPERSCRIPT[+-]", # If you see a superscript number then you need a sign
+    }
+    # Fall back to the full list when every allowed name is in the skip set.
+    names = [s for s in allowed if s not in skip] or allowed
+    # Perform substitution for document strings, merging duplicates
+    labels = sorted({subst.get(s, s) for s in names})
+    if not labels:
+        # This occurs when the middle part of percent mixtures have no percentage.
+        # We could look for '//' in the string to report a better error message.
+        return "end of formula"
+    if len(labels) == 1:
+        return labels[0]
+    return f"one of {' '.join(labels)}"
+
+def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
+    """
+    Parse a chemical formula, returning a structure with elements from the
+    given periodic table.
+
+    *formula_str* is the text to parse. It is handed to the token converter,
+    which records it as the name of the returned formula.
+
+    *table* is an optional alternative periodic table.
+
+    Raises ValueError listing the expected tokens and the surrounding text if the
+    string does not match the grammar. Exceptions raised while converting the parse
+    tree, such as an unknown element symbol, are reraised without the lark wrapper.
+    """
+    cleanup = StripPunctuation()
+    convert = ConvertTokens(formula_str, table=table)
+    try:
+        tree = formula_parser.parse(formula_str)
+    except lark.exceptions.UnexpectedCharacters as exc:
+        context = exc.get_context(formula_str).rstrip()
+        message = f"Expected {_allowed(exc.allowed)} in\n{context}"
+        # The message is self-contained, so drop the lark traceback like the paths below.
+        raise ValueError(message) from None
+    except lark.exceptions.UnexpectedEOF as exc:
+        context = exc.get_context(formula_str).rstrip()
+        message = f"Expected {_allowed(exc.expected)} in\n{context}"
+        raise ValueError(message) from None
+    except Exception as exc:
+        # TODO: are other exceptions possible from the Earley parser?
+        raise exc from None
+    tree = cleanup.transform(tree)
+    try:
+        tree = convert.transform(tree)
+    except lark.exceptions.VisitError as exc:
+        # Unwind the VisitError exception capture and reraise the original exception
+        # This requires that error messages in the transformer give enough context to
+        # correct the error.
+        raise exc.orig_exc from None
+    return tree
+
+# Error conditions are marked with '!' so the exception is ignored
+# Lines marked ## fail on the pyparsing parser
+examples = """
+
+# === Composite tests ===
+Co
+H2SO4
+CaCO3
+CaCO₃
+(Co@5) ##
+(((Co@5)@6)) ##
+CaCO3+6H2O
+CaCO3 6H2O
+CaCO3(H2O)6
+CaCO3 (H2O)6
+(Ca(CO3)((H2O)6))
+CaCO₃·6H₂O ##
+! Bl2Oh # bad symbol
+! (Co # mismatched LPAR
+! Co) # mismatched RPAR
+! ((Co) # mismatched LPAR
+! ₃H2O # badly placed subscript
+
+# === Isotope tests ===
+DHO
+H[1]
+¹⁸O₂
+! Fe[56O2 # bad isotope syntax
+! Co[181] # bad isotope
+
+# === Valence tests ===
+Ca{2+}
+Ca{++}
+Ca⁺⁺ ##
+O{2-}
+O{--}
+O²⁻ ##
+H{+}
+H{-}
+HO{1-} # HO- applies to the group, but valence is attached to O
+H[1]{1-}O
+²H⁺ # D{+} ##
+O²H⁻ # no ambiguity since valence requires a trailing + or - ##
+O²⁻H⁺ # O{2-}H{+} ##
+O²⁻²H⁺ # O{2-}D{+} ##
+! Ca{2} # missing charge in valence
+! Ca{2++} # can't use number++
+! Ca{2+O2 # missing close brace on valence
+! Co{17-} # bad valence value
+! Ca ⁺⁺ # extra space before valence
+! Ca++ # missing braces in valence: the + is acting as SEPARATOR
+! Ca2+ # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR
+! O² # Should be looking for SUPERCHARGE (e.g., O²⁻) or SYMBOL (e.g., O²H)
+
+# === Density tests ===
+H2O@1 # density is 1, where H and O use natural abundance
+H2O @ 1 # spaces allowed around '@' ##
+D2O@1n # natural density "n" is 1 so isotopic density is 1.11
+D2O@1.11i # isotopic density is 1.11
+D2O@1.11 # default is "i" for isotopic density
+C3H4H[1]NO@1.29n # another natural density example
+78.2H2O[16] + 21.8H2O[18] @1n # density applies to composite
+! 3g Ca@ // 5g Si # missing density value
+! Ca@i # missing density value ##
+! H2O@1h # bad density mode
+
+# === Mixture tests ===
+50 wt% Co // Ti # mix by mass; final component does need percentage
+33 wt% Co // 33% Fe // Ti # intermediate components need percentage
+! 93 wt% Co // 33% Fe // Ti # more than 100 wt%
+! 93 vol% Co // 33% Fe // Ti # more than 100 vol%
+20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n
+5g NaCl // 50mL H2O@1 # volume components need density to determine mass fraction
+5g NaCl@2.16 // 50mL H2O@1 # need component densities to estimate mixture density
+NaCl(H2O)29.1966(D2O)122.794@1.10i # mixture rendered as formula
+! 5g NaCl // 50mL H2O # need density for H2O to convert volume to mass
+(10 wt% NaCl // H2O)@1.07n # set density of a mixture
+50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n
+1 cm Si // 5 nm Cr // 10 nm Au
+! 4 nm NaCl@2.17// 50 g Si # can't use mass in layer mixture
+! 3..5 mg NaCl # bad number format
+! 5 Mg NaCl // 50mL H2O@1 # bad units
+! 3.5 fm Si # bad units; expecting wt%/vol% or LENGTH, VOLUME, MASS
+! 3.5 mm Si // 2.5 nm SiO2 // # missing final component of mixture
+! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG # bad final component of mixture
+! // 3g Ca # // is not a comment
+! 37 vol% H2O@1 / 5% D2O@1 # missing /
+! 37 vol% H2O@1 /// 5% D2O@1 # extra /
+! 37 vol% NaCl@2.16 // H2O@1 // D2O@1 # percent missing in middle part
+! 37 vol% H2O@1 // 5% D2O@1 # percent not allowed in last part
+! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts
+! 37% H2O@1 // D2O@1 # missing vol% or wt%
+! 37 val% H2O@1 // D2O@1 # bad spelling of vol%
+
+# === FASTA tests ===
+dna:CAGT
+dna:CAGT @1n # can override the density of a FASTA sequence
+aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
+! DNA:CAGT # incorrect case for FASTA type
+! dna CAGT # missing colon between FASTA type and sequence
+! bad:CAGT # bad FASTA sequence type
+
+"""
+
+def check():
+ for line in examples.split('\n'):
+ formula = line.split('#')[0]
+ bad = line.startswith('!')
+ if bad:
+ formula = formula[1:]
+ if formula.strip():
+ print()
+ if bad:
+ print(f"!!! {line[1:]}")
+ else:
+ print(f"*** {line}")
+ try:
+ tree = parse_formula(formula)
+ density = getattr(tree, 'density', None)
+ density_str = f" @ {density:.2f}" if density else ""
+ mode = 'unicode' # unicode latex html plain
+ # mode = 'plain'
+ print(f" => {pretty_formula(tree, mode)}{density_str}")
+ # print(f" {getattr(tree, 'structure', None)}")
+ except Exception as exc:
+ if bad:
+ print(f"{exc}")
+ else:
+ raise exc from None
+ else:
+ if '##' in line:
+ continue # pyparsing should fail but doesn't
+ if bad:
+ raise RuntimeError(f"Exception not raised for <{formula}>")
+ else:
+ print(line)
+
+def main():
+    """
+    Command line entry point.
+
+    With arguments, parse each one as a formula and print it followed by its
+    density, total mass and thickness, each shown only when it has a non-zero
+    value. Without arguments, run check() over the built-in examples.
+    """
+    import sys
+
+    if len(sys.argv) > 1:
+        for arg in sys.argv[1:]:
+            formula = parse_formula(arg)
+            mass = f" {formula.total_mass:.4g} g" if formula.total_mass else ""
+            density = f"@{formula.density:.4g}" if formula.density else ""
+            # The layers transform stores thickness in centimeters; show it in millimeters
+            thickness = f" {10*formula.thickness:.4g} mm" if formula.thickness else ""
+            print(f"{formula}{density}{mass}{thickness}")
+    else:
+        check()
+
+if __name__ == "__main__":
+ main()
diff --git a/periodictable/util.py b/periodictable/util.py
index d7fa8ec..5884f1f 100644
--- a/periodictable/util.py
+++ b/periodictable/util.py
@@ -53,6 +53,78 @@ def parse_uncertainty(s: str) -> tuple[float, float]|tuple[None, None]:
# Plain value with no uncertainty
return float(s), 0
# Unicode subscript characters and their plain equivalents. Built once at
# import time so from_subscript() does not rebuild the mapping on every call.
_SUBSCRIPT_TO_PLAIN = str.maketrans({
    '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
    '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7',
    '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-',
    '\u208c': '=', '\u208d': '(', '\u208e': ')',

    '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x',
    '\u2095': 'h', '\u2096': 'k', '\u2097': 'l',
    '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's',
    '\u209c': 't',
})

def from_subscript(value: str) -> str:
    """
    Convert unicode subscript characters to normal characters. This allows us
    to parse, for example, H₂O as H2O.

    *value* is converted with ``str()`` before translation, so non-string
    values are accepted. Characters with no entry in the table (including the
    decimal point) are passed through unchanged.

    Returns the translated string.
    """
    return str(value).translate(_SUBSCRIPT_TO_PLAIN)
+
# Unicode superscript characters and their plain equivalents. Built once at
# import time so from_superscript() does not rebuild the mapping on every call.
_SUPERSCRIPT_TO_PLAIN = str.maketrans({
    '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
    '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
    '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
    '\u207c': '=', '\u207d': '(', '\u207e': ')',

    '\u2071': 'i', '\u207f': 'n',

    # unicode_superscript() renders the decimal point as U+1427; map it back
    # so that a printed superscript number converts to the text it came from.
    '\u1427': '.',
})

def from_superscript(value: str) -> str:
    """
    Convert unicode superscript characters to normal characters. This allows
    us to parse, for example, Ca²⁺ as Ca{2+}. Only the characters themselves
    are translated (Ca²⁺ becomes Ca2+); no braces are added here.

    *value* is converted with ``str()`` before translation, so non-string
    values are accepted. Characters with no entry in the table are passed
    through unchanged.

    Returns the translated string.
    """
    return str(value).translate(_SUPERSCRIPT_TO_PLAIN)
+
# Plain characters and their unicode subscript codepoints. Built once at
# import time so unicode_subscript() does not rebuild the mapping on every call.
_PLAIN_TO_SUBSCRIPT = str.maketrans({
    '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083',
    '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087',
    '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b',
    '=': '\u208c', '(': '\u208d', ')': '\u208e',

    'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093',
    'h': '\u2095', 'k': '\u2096', 'l': '\u2097',
    'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b',
    't': '\u209c',

    '\u2013': '\u208b',  # en-dash is same as dash
    '\u2014': '\u208b',  # em-dash is same as dash
})

def unicode_subscript(value: str) -> str:
    """
    Convert normal characters to unicode subscript characters, so that for
    example "2" becomes "₂".

    *value* is converted with ``str()`` before translation, so numbers can be
    passed directly. Characters with no entry in the table are passed through
    unchanged; this includes the decimal point, which looks okay next to
    subscript digits.

    Returns the translated string.
    """
    return str(value).translate(_PLAIN_TO_SUBSCRIPT)
+
# Plain characters and their unicode superscript codepoints. Built once at
# import time so unicode_superscript() does not rebuild the mapping on every
# call.
_PLAIN_TO_SUPERSCRIPT = str.maketrans({
    # Candidates that were considered for the decimal point:
    #'.': '\u00B0', # degree symbol looks too much like zero
    #'.': ' \u02D9', # dot above modifier looks okay in a floating string, but risky
    #'.': ' \u0307', # space with dot above?
    #'.': '\u22C5', # math dot operator
    '.': '\u1427',  # Canadian aboriginal extended block dot (looks good on mac)
    '2': '\u00B2', '3': '\u00B3',
    '1': '\u00B9',
    '0': '\u2070', 'i': '\u2071',
    '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077',
    '8': '\u2078', '9': '\u2079', '+': '\u207a', '-': '\u207b',
    '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f',

    '\u2013': '\u207b',  # en-dash is same as dash
    '\u2014': '\u207b',  # em-dash is same as dash
})

def unicode_superscript(value: str) -> str:
    """
    Convert normal characters to unicode superscript characters, so that for
    example "2+" becomes "²⁺".

    *value* is converted with ``str()`` before translation, so numbers can be
    passed directly. Unlike the subscript form, the decimal point is replaced
    by a raised dot (U+1427). Characters with no entry in the table are
    passed through unchanged.

    Returns the translated string.
    """
    return str(value).translate(_PLAIN_TO_SUPERSCRIPT)
+
+
def cell_volume(a=None, b=None, c=None, alpha=None, beta=None, gamma=None) -> float:
r"""
Compute cell volume from lattice parameters.
diff --git a/pyproject.toml b/pyproject.toml
index 1b651ef..a327bf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,8 @@
]
license = { file = "LICENSE.txt" }
dependencies = [
- "pyparsing>=3.0.0", "numpy",
+ "numpy",
+ "lark",
]
classifiers = [
@@ -36,6 +37,8 @@
{include-group = "build"},
{include-group = "docs"},
{include-group = "test"},
+ # TODO: Shouldn't have to copy base dependencies here...is there a better way?
+ "numpy", "lark",
]
[project.urls]
diff --git a/test/test_core.py b/test/test_core.py
index d010a76..c86b9a0 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -64,7 +64,7 @@ def test():
Fe.ion[-3]
raise Exception("accepts invalid ions")
except ValueError as msg:
- assert str(msg) == "-3 is not a valid charge for Fe"
+ assert str(msg) == "valence 3- is not valid for Fe"
assert data_files()[0][0] == "periodictable-data/xsf"
diff --git a/test/test_formulas.py b/test/test_formulas.py
index eda94de..4eb2b9e 100644
--- a/test/test_formulas.py
+++ b/test/test_formulas.py
@@ -14,12 +14,19 @@ def check_parse_fails(s):
raise Exception(f'formula("{s}") should fail to parse')
def test():
+ # CaCO3(H2O)6 is a tuple of (count, atom) followed by (6, H2O)
+ # CaCO3+6H2O is ((1, CaCO3), (6, H2O))
ikaite = formula()
- # Note: this should be a tuple of tuples
ikaite.structure = ((1, Ca), (1, C), (3, O), (6, ((2, H), (1, O))))
+ ikaite.name = "CaCO3(H2O)6"
+ ikaite_grouped = formula()
+ ikaite_grouped.structure = ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O))))
+ ikaite_grouped.name = "CaCO3+6H2O"
# Test print
assert str(ikaite) == "CaCO3(H2O)6"
+ assert str(ikaite_grouped) == "CaCO3(H2O)6"
+ # TODO: parsing a printed structure should produce the same structure
# Test constructors
assert ikaite == formula([(1, Ca), (1, C), (3, O), (6, [(2, H), (1, O)])])
@@ -31,9 +38,9 @@ def test():
assert formula("Ca") == formula([(1, Ca)])
assert formula("Ca") == formula(Ca)
assert formula("CaCO3") == formula([(1, Ca), (1, C), (3, O)])
- assert ikaite == formula("CaCO3+6H2O")
- assert ikaite == formula("(CaCO3+6H2O)1")
- assert ikaite == formula("CaCO3 6H2O")
+ assert ikaite_grouped == formula("CaCO3+6H2O")
+ assert ikaite_grouped == formula("(CaCO3+6H2O)1")
+ assert ikaite_grouped == formula("CaCO3 6H2O")
assert ikaite == formula("CaCO3(H2O)6")
assert ikaite == formula("(CaCO3(H2O)6)1")
assert ikaite.hill == formula("CCaO3(H2O)6").hill
@@ -43,7 +50,7 @@ def test():
# Unicode, latex and html subscripts
assert formula([(0.75, Fe), (0.25, Ni)]) == formula("Fe₀.₇₅Ni₀.₂₅")
assert ikaite == formula("CaCO₃(H₂O)₆")
- assert ikaite == formula("CaCO₃6H₂O") # with subscripts we know it isn't O36
+ assert ikaite_grouped == formula("CaCO₃ 6H₂O") # with subscripts we know it isn't O36
assert pretty(ikaite, 'unicode') == "CaCO₃(H₂O)₆"
assert pretty(ikaite, 'html') == "CaCO3(H2O)6"
assert pretty(ikaite, 'latex') == "CaCO$_{3}$(H$_{2}$O)$_{6}$"
@@ -116,14 +123,15 @@ def test():
# Check that names work
permalloy = formula('Ni8Fe2', 8.692, name='permalloy')
- assert str(permalloy) == 'permalloy'
+ assert str(permalloy) == 'Ni8Fe2'
+ assert permalloy.name == 'permalloy'
# Check that get/restore state works
assert deepcopy(permalloy).__dict__ == permalloy.__dict__
# Check that copy constructor works
- #print permalloy.__dict__
- #print formula(permalloy).__dict__
+ # print(permalloy.__dict__)
+ # print(formula(permalloy).__dict__)
assert formula(permalloy).__dict__ == permalloy.__dict__
assert formula('Si', name='Silicon').__dict__ != formula('Si').__dict__